Unescaping some more HTML entities
This commit is contained in:
parent
d7302c58ee
commit
56691fbdaa
|
@ -1,6 +1,6 @@
|
||||||
cmake_minimum_required (VERSION 3.7)
|
cmake_minimum_required (VERSION 3.7)
|
||||||
project (mastorss
|
project (mastorss
|
||||||
VERSION 0.2.6
|
VERSION 0.3.0
|
||||||
LANGUAGES CXX
|
LANGUAGES CXX
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
@ -13,9 +13,10 @@ extern std::uint16_t max_size;
|
||||||
extern const string filepath;
|
extern const string filepath;
|
||||||
|
|
||||||
std::uint16_t read_config(pt::ptree &config, const string &profile, string &instance, string &access_token, string &feedurl);
|
std::uint16_t read_config(pt::ptree &config, const string &profile, string &instance, string &access_token, string &feedurl);
|
||||||
std::vector<string> parse_website(const string &profile, const string &xml);
|
|
||||||
|
|
||||||
// http.cpp
|
std::vector<string> parse_website(const string &profile, const string &xml);
|
||||||
|
void unescape_html(const string &str);
|
||||||
|
|
||||||
const std::uint16_t http_get(const string &feedurl,
|
const std::uint16_t http_get(const string &feedurl,
|
||||||
string &answer, const string &useragent = "");
|
string &answer, const string &useragent = "");
|
||||||
void curlpp_init();
|
void curlpp_init();
|
||||||
|
|
|
@ -20,6 +20,8 @@
|
||||||
#include <cstdint>
|
#include <cstdint>
|
||||||
#include <regex>
|
#include <regex>
|
||||||
#include <sstream>
|
#include <sstream>
|
||||||
|
#include <locale>
|
||||||
|
#include <codecvt>
|
||||||
#include <boost/property_tree/ptree.hpp>
|
#include <boost/property_tree/ptree.hpp>
|
||||||
#include <boost/property_tree/json_parser.hpp>
|
#include <boost/property_tree/json_parser.hpp>
|
||||||
#include <boost/property_tree/xml_parser.hpp>
|
#include <boost/property_tree/xml_parser.hpp>
|
||||||
|
@ -32,6 +34,29 @@ namespace pt = boost::property_tree;
|
||||||
using std::cerr;
|
using std::cerr;
|
||||||
using std::string;
|
using std::string;
|
||||||
|
|
||||||
|
// Decodes HTML character references in str, in place.
//
// Handled references:
//   * decimal numeric references ("&#NNNN;", 1 to 7 digits),
//   * hexadecimal numeric references ("&#xHHHH;", 1 to 6 digits),
//   * the named references for '<' and '>' (lt, gt).
//
// Numeric references are written out as UTF-8. A reference that does not
// denote a valid Unicode scalar value (0, a surrogate, or anything above
// U+10FFFF) is copied through unchanged. Everything is decoded in a single
// pass, so text produced by one reference is never decoded a second time.
void unescape_html(std::string &str)
{
    // Appends code point cp to out as UTF-8. Returns false, leaving out
    // untouched, if cp is not a valid Unicode scalar value.
    const auto append_utf8 = [](std::string &out, const unsigned long cp) -> bool
    {
        if (cp == 0 || cp > 0x10FFFF || (cp >= 0xD800 && cp <= 0xDFFF))
        {
            return false;
        }

        if (cp < 0x80)
        {
            out += static_cast<char>(cp);
        }
        else if (cp < 0x800)
        {
            out += static_cast<char>(0xC0 | (cp >> 6));
            out += static_cast<char>(0x80 | (cp & 0x3F));
        }
        else if (cp < 0x10000)
        {
            out += static_cast<char>(0xE0 | (cp >> 12));
            out += static_cast<char>(0x80 | ((cp >> 6) & 0x3F));
            out += static_cast<char>(0x80 | (cp & 0x3F));
        }
        else
        {
            out += static_cast<char>(0xF0 | (cp >> 18));
            out += static_cast<char>(0x80 | ((cp >> 12) & 0x3F));
            out += static_cast<char>(0x80 | ((cp >> 6) & 0x3F));
            out += static_cast<char>(0x80 | (cp & 0x3F));
        }
        return true;
    };

    // Group 1: decimal digits, group 2: hex digits, group 3: entity name.
    // The digit counts are bounded so that std::stoul can never overflow.
    static const std::regex reentity(
        "&(?:#([0-9]{1,7})|#[xX]([0-9A-Fa-f]{1,6})|(lt|gt));");

    std::string result;
    result.reserve(str.size());

    // Start of the text that has not been copied to result yet
    auto tail = str.cbegin();
    const std::sregex_iterator end;
    for (std::sregex_iterator it(str.cbegin(), str.cend(), reentity);
         it != end; ++it)
    {
        const std::smatch &match = *it;
        result.append(tail, match[0].first);
        tail = match[0].second;

        if (match[3].matched)
        {
            result += (match[3].str() == "lt") ? '<' : '>';
            continue;
        }

        const unsigned long codepoint = match[1].matched
            ? std::stoul(match[1].str(), nullptr, 10)
            : std::stoul(match[2].str(), nullptr, 16);
        if (!append_utf8(result, codepoint))
        {
            // Not a valid character: keep the reference as it was written
            result.append(match[0].first, match[0].second);
        }
    }
    result.append(tail, str.cend());

    str.swap(result);
}
|
||||||
|
|
||||||
std::vector<string> parse_website(const string &profile, const string &xml)
|
std::vector<string> parse_website(const string &profile, const string &xml)
|
||||||
{
|
{
|
||||||
pt::ptree json;
|
pt::ptree json;
|
||||||
|
@ -89,14 +114,13 @@ std::vector<string> parse_website(const string &profile, const string &xml)
|
||||||
string str = title + "\n\n" + desc;
|
string str = title + "\n\n" + desc;
|
||||||
|
|
||||||
// ANF News puts this always on top, causing us to think it's new
|
// ANF News puts this always on top, causing us to think it's new
|
||||||
if (title.compare("Newsticker zu den Angriffen auf Efrîn") == 0)
|
if (title.compare("Newsticker zu den Angriffen auf Efrîn ") == 0)
|
||||||
{
|
{
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Some feeds contain encoded xhtml-tags >:|
|
unescape_html(str);
|
||||||
std::regex relt("<");
|
|
||||||
std::regex regt(">");
|
|
||||||
std::regex reparagraph("</p><p>");
|
std::regex reparagraph("</p><p>");
|
||||||
std::regex recdata1("<!\\[CDATA\\[");
|
std::regex recdata1("<!\\[CDATA\\[");
|
||||||
std::regex recdata2("\\]\\]>");
|
std::regex recdata2("\\]\\]>");
|
||||||
|
@ -109,8 +133,6 @@ std::vector<string> parse_website(const string &profile, const string &xml)
|
||||||
// GG/BO closing
|
// GG/BO closing
|
||||||
std::regex reggboclosing("Die von den einzelnen AutorInnen .*$");
|
std::regex reggboclosing("Die von den einzelnen AutorInnen .*$");
|
||||||
|
|
||||||
str = std::regex_replace(str, relt, "<");
|
|
||||||
str = std::regex_replace(str, regt, ">");
|
|
||||||
str = std::regex_replace(str, reparagraph, "\n\n");
|
str = std::regex_replace(str, reparagraph, "\n\n");
|
||||||
str = std::regex_replace(str, recdata1, "");
|
str = std::regex_replace(str, recdata1, "");
|
||||||
str = std::regex_replace(str, recdata2, "");
|
str = std::regex_replace(str, recdata2, "");
|
||||||
|
|
Loading…
Reference in New Issue
Block a user