Unescaping some more HTML entities

This commit is contained in:
tastytea 2018-02-22 02:51:58 +01:00
parent d7302c58ee
commit 56691fbdaa
Signed by: tastytea
GPG Key ID: 59346E0EA35C67E5
3 changed files with 32 additions and 9 deletions

View File

@ -1,6 +1,6 @@
cmake_minimum_required (VERSION 3.7)
project (mastorss
VERSION 0.2.6
VERSION 0.3.0
LANGUAGES CXX
)

View File

@ -13,9 +13,10 @@ extern std::uint16_t max_size;
extern const string filepath;
std::uint16_t read_config(pt::ptree &config, const string &profile, string &instance, string &access_token, string &feedurl);
std::vector<string> parse_website(const string &profile, const string &xml);
// http.cpp
std::vector<string> parse_website(const string &profile, const string &xml);
// Unescapes HTML entities in str in place. The parameter is a mutable
// reference because the function writes its result back into str.
void unescape_html(string &str);
const std::uint16_t http_get(const string &feedurl,
string &answer, const string &useragent = "");
void curlpp_init();

View File

@ -20,6 +20,8 @@
#include <cstdint>
#include <regex>
#include <sstream>
#include <locale>
#include <codecvt>
#include <boost/property_tree/ptree.hpp>
#include <boost/property_tree/json_parser.hpp>
#include <boost/property_tree/xml_parser.hpp>
@ -32,6 +34,29 @@ namespace pt = boost::property_tree;
using std::cerr;
using std::string;
// Unescape HTML character references in str, in place.
//
// Pass 1 decodes numeric references, decimal ("&#8220;", "&#39;") and
// hexadecimal ("&#x27;"), into UTF-8. References that do not denote a
// Unicode scalar value (surrogates, anything above U+10FFFF) are copied
// through unchanged.
// Pass 2 replaces the named entities "&lt;" and "&gt;" with '<' and '>'.
//
// str: text to unescape; overwritten with the result.
void unescape_html(string &str)
{
    // Append the UTF-8 encoding of one code point to out.
    const auto append_utf8 = [](string &out, const unsigned long cp)
    {
        if (cp < 0x80)
        {
            out += static_cast<char>(cp);
        }
        else if (cp < 0x800)
        {
            out += static_cast<char>(0xC0 | (cp >> 6));
            out += static_cast<char>(0x80 | (cp & 0x3F));
        }
        else if (cp < 0x10000)
        {
            out += static_cast<char>(0xE0 | (cp >> 12));
            out += static_cast<char>(0x80 | ((cp >> 6) & 0x3F));
            out += static_cast<char>(0x80 | (cp & 0x3F));
        }
        else
        {
            out += static_cast<char>(0xF0 | (cp >> 18));
            out += static_cast<char>(0x80 | ((cp >> 12) & 0x3F));
            out += static_cast<char>(0x80 | ((cp >> 6) & 0x3F));
            out += static_cast<char>(0x80 | (cp & 0x3F));
        }
    };

    // Group 1: decimal digits, group 2: hexadecimal digits. The digit
    // counts are bounded so that std::stoul can never overflow.
    static const std::regex reentity(
        "&#(?:(\\d{1,7})|[xX]([0-9a-fA-F]{1,6}));");
    static const std::regex relt("&lt;");
    static const std::regex regt("&gt;");

    string decoded;
    decoded.reserve(str.size());

    // Walk all matches over the untouched input and copy the text between
    // them, instead of re-copying the remaining suffix after every match.
    auto tail = str.cbegin();
    for (std::sregex_iterator it(str.cbegin(), str.cend(), reentity), end;
         it != end; ++it)
    {
        const std::smatch &match = *it;
        decoded.append(tail, match[0].first);

        const unsigned long codepoint = match[1].matched
            ? std::stoul(match[1].str(), nullptr, 10)
            : std::stoul(match[2].str(), nullptr, 16);
        const bool is_surrogate = (codepoint >= 0xD800 && codepoint <= 0xDFFF);

        if (codepoint > 0x10FFFF || is_surrogate)
        {
            // Not encodable as UTF-8: keep the reference as written.
            decoded.append(match[0].first, match[0].second);
        }
        else
        {
            append_utf8(decoded, codepoint);
        }
        tail = match[0].second;
    }
    decoded.append(tail, str.cend());

    // Named entities are handled after the numeric ones, as before.
    decoded = std::regex_replace(decoded, relt, "<");
    str = std::regex_replace(decoded, regt, ">");
}
std::vector<string> parse_website(const string &profile, const string &xml)
{
pt::ptree json;
@ -89,14 +114,13 @@ std::vector<string> parse_website(const string &profile, const string &xml)
string str = title + "\n\n" + desc;
// ANF News puts this always on top, causing us to think it's new
if (title.compare("Newsticker zu den Angriffen auf Efrîn") == 0)
if (title.compare("Newsticker zu den Angriffen auf Efrîn ") == 0)
{
continue;
}
// Some feeds contain encoded xhtml-tags >:|
std::regex relt("&lt;");
std::regex regt("&gt;");
unescape_html(str);
std::regex reparagraph("</p><p>");
std::regex recdata1("<!\\[CDATA\\[");
std::regex recdata2("\\]\\]>");
@ -109,8 +133,6 @@ std::vector<string> parse_website(const string &profile, const string &xml)
// GG/BO closing
std::regex reggboclosing("Die von den einzelnen AutorInnen .*$");
str = std::regex_replace(str, relt, "<");
str = std::regex_replace(str, regt, ">");
str = std::regex_replace(str, reparagraph, "\n\n");
str = std::regex_replace(str, recdata1, "");
str = std::regex_replace(str, recdata2, "");