Unescaping some more HTML entities

This commit is contained in:
tastytea 2018-02-22 02:51:58 +01:00
parent d7302c58ee
commit 56691fbdaa
Signed by: tastytea
GPG Key ID: 59346E0EA35C67E5
3 changed files with 32 additions and 9 deletions

View File

@ -1,6 +1,6 @@
cmake_minimum_required (VERSION 3.7) cmake_minimum_required (VERSION 3.7)
project (mastorss project (mastorss
VERSION 0.2.6 VERSION 0.3.0
LANGUAGES CXX LANGUAGES CXX
) )

View File

@ -13,9 +13,10 @@ extern std::uint16_t max_size;
extern const string filepath; extern const string filepath;
std::uint16_t read_config(pt::ptree &config, const string &profile, string &instance, string &access_token, string &feedurl); std::uint16_t read_config(pt::ptree &config, const string &profile, string &instance, string &access_token, string &feedurl);
std::vector<string> parse_website(const string &profile, const string &xml);
// http.cpp std::vector<string> parse_website(const string &profile, const string &xml);
// Translate &#0123; and some named HTML entities to chars. The string is
// modified in place, so it must be passed as a non-const reference (this
// matches the definition; a const reference would declare a different,
// undefined overload).
void unescape_html(string &str);
const std::uint16_t http_get(const string &feedurl, const std::uint16_t http_get(const string &feedurl,
string &answer, const string &useragent = ""); string &answer, const string &useragent = "");
void curlpp_init(); void curlpp_init();

View File

@ -20,6 +20,8 @@
#include <cstdint> #include <cstdint>
#include <regex> #include <regex>
#include <sstream> #include <sstream>
#include <locale>
#include <codecvt>
#include <boost/property_tree/ptree.hpp> #include <boost/property_tree/ptree.hpp>
#include <boost/property_tree/json_parser.hpp> #include <boost/property_tree/json_parser.hpp>
#include <boost/property_tree/xml_parser.hpp> #include <boost/property_tree/xml_parser.hpp>
@ -32,6 +34,29 @@ namespace pt = boost::property_tree;
using std::cerr; using std::cerr;
using std::string; using std::string;
// Translate HTML character references to UTF-8 characters, in place.
//
// Handled references:
//   * decimal numeric references with 1-7 digits   (&#39; &#0123; &#128512;)
//   * hexadecimal numeric references with 1-6 digits (&#x27; &#x1F600;)
//   * the named entities &lt; &gt; &amp; &quot; &apos;
//
// The input is decoded in a single pass, so text produced by one replacement
// is never decoded a second time ("&amp;lt;" becomes "&lt;", not "<").
// References that do not denote a valid Unicode scalar value (0, surrogates,
// values above U+10FFFF) and unknown entities are left untouched.
void unescape_html(string &str)
{
    // Encode a code point as UTF-8. Returns an empty string for values that
    // are not valid Unicode scalar values. Done by hand because
    // std::codecvt_utf8<char16_t> is limited to the BMP and deprecated.
    const auto to_utf8 = [](const std::uint32_t cp) -> string
    {
        string out;
        if (cp == 0 || cp > 0x10FFFF || (cp >= 0xD800 && cp <= 0xDFFF))
        {
            return out;
        }
        if (cp < 0x80)
        {
            out += static_cast<char>(cp);
        }
        else if (cp < 0x800)
        {
            out += static_cast<char>(0xC0 | (cp >> 6));
            out += static_cast<char>(0x80 | (cp & 0x3F));
        }
        else if (cp < 0x10000)
        {
            out += static_cast<char>(0xE0 | (cp >> 12));
            out += static_cast<char>(0x80 | ((cp >> 6) & 0x3F));
            out += static_cast<char>(0x80 | (cp & 0x3F));
        }
        else
        {
            out += static_cast<char>(0xF0 | (cp >> 18));
            out += static_cast<char>(0x80 | ((cp >> 12) & 0x3F));
            out += static_cast<char>(0x80 | ((cp >> 6) & 0x3F));
            out += static_cast<char>(0x80 | (cp & 0x3F));
        }
        return out;
    };

    // Group 1: decimal digits, group 2: hex digits, group 3: entity name.
    // The digit counts are bounded so std::stoul can never overflow.
    static const std::regex reentity(
        "&(?:#([0-9]{1,7})|#[xX]([0-9A-Fa-f]{1,6})|(lt|gt|amp|quot|apos));");

    string result;
    result.reserve(str.size());

    auto pos = str.cbegin();
    std::smatch match;
    while (std::regex_search(pos, str.cend(), match, reentity))
    {
        // Copy the text in front of the reference unchanged
        result.append(pos, match[0].first);

        string replacement;
        if (match[1].matched)
        {
            replacement = to_utf8(
                static_cast<std::uint32_t>(std::stoul(match[1].str())));
        }
        else if (match[2].matched)
        {
            replacement = to_utf8(
                static_cast<std::uint32_t>(std::stoul(match[2].str(), nullptr, 16)));
        }
        else
        {
            const string name = match[3].str();
            if (name == "lt")
            {
                replacement = "<";
            }
            else if (name == "gt")
            {
                replacement = ">";
            }
            else if (name == "amp")
            {
                replacement = "&";
            }
            else if (name == "quot")
            {
                replacement = "\"";
            }
            else
            {
                replacement = "'";
            }
        }

        // Invalid code point: keep the reference as it was written
        if (replacement.empty())
        {
            replacement = match[0].str();
        }
        result += replacement;
        pos = match[0].second;
    }
    result.append(pos, str.cend());

    str.swap(result);
}
std::vector<string> parse_website(const string &profile, const string &xml) std::vector<string> parse_website(const string &profile, const string &xml)
{ {
pt::ptree json; pt::ptree json;
@ -94,9 +119,8 @@ std::vector<string> parse_website(const string &profile, const string &xml)
continue; continue;
} }
// Some feeds contain encoded xhtml-tags >:| unescape_html(str);
std::regex relt("&lt;");
std::regex regt("&gt;");
std::regex reparagraph("</p><p>"); std::regex reparagraph("</p><p>");
std::regex recdata1("<!\\[CDATA\\["); std::regex recdata1("<!\\[CDATA\\[");
std::regex recdata2("\\]\\]>"); std::regex recdata2("\\]\\]>");
@ -109,8 +133,6 @@ std::vector<string> parse_website(const string &profile, const string &xml)
// GG/BO closing // GG/BO closing
std::regex reggboclosing("Die von den einzelnen AutorInnen .*$"); std::regex reggboclosing("Die von den einzelnen AutorInnen .*$");
str = std::regex_replace(str, relt, "<");
str = std::regex_replace(str, regt, ">");
str = std::regex_replace(str, reparagraph, "\n\n"); str = std::regex_replace(str, reparagraph, "\n\n");
str = std::regex_replace(str, recdata1, ""); str = std::regex_replace(str, recdata1, "");
str = std::regex_replace(str, recdata2, ""); str = std::regex_replace(str, recdata2, "");