Unescaping some more HTML entities

2018-02-22 02:51:58 +01:00 · 2018-02-22 02:51:58 +01:00 · 56691fbdaa
commit 56691fbdaa
parent d7302c58ee
3 changed files with 32 additions and 9 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -1,6 +1,6 @@
 cmake_minimum_required (VERSION 3.7)
 project (mastorss
-         VERSION 0.2.6
+         VERSION 0.3.0
         LANGUAGES CXX
 )

--- a/src/mastorss.hpp
+++ b/src/mastorss.hpp
@ -13,9 +13,10 @@ extern std::uint16_t max_size;
 extern const string filepath;

 std::uint16_t read_config(pt::ptree &config, const string &profile, string &instance, string &access_token, string &feedurl);
-std::vector<string> parse_website(const string &profile, const string &xml);

-// http.cpp
+std::vector<string> parse_website(const string &profile, const string &xml);
+void unescape_html(string &str); // Unescapes HTML entities in str, in place
+
 const std::uint16_t http_get(const string &feedurl,
                             string &answer, const string &useragent = "");
 void curlpp_init();
--- a/src/parse.cpp
+++ b/src/parse.cpp
@ -20,6 +20,8 @@
 #include <cstdint>
 #include <regex>
 #include <sstream>
+#include <locale>
+#include <codecvt>
 #include <boost/property_tree/ptree.hpp>
 #include <boost/property_tree/json_parser.hpp>
 #include <boost/property_tree/xml_parser.hpp>
@ -32,6 +34,29 @@ namespace pt = boost::property_tree;
 using std::cerr;
 using std::string;

+// Translate numeric entities like &#39; or &#0123; and &lt; / &gt; to chars,
+// in place. Entities naming an invalid code point (a surrogate or a value
+// above U+10FFFF) are left untouched.
+void unescape_html(string &str)
+{
+    std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> u8c;
+    const std::regex reentity("&#(\\d{1,7});");
+    std::smatch match;
+    string out;
+    auto pos = str.cbegin();
+    // Search on iterators so the rest of the text is not copied per match
+    while (std::regex_search(pos, str.cend(), match, reentity))
+    {
+        const unsigned long cp = std::stoul(match[1].str());
+        const bool valid = cp <= 0x10FFFF && (cp < 0xD800 || cp > 0xDFFF);
+        out.append(pos, match[0].first);
+        out += valid ? u8c.to_bytes(static_cast<char32_t>(cp)) : match[0].str();
+        pos = match[0].second;
+    }
+    out.append(pos, str.cend());
+    str = std::regex_replace(out, std::regex("&lt;"), "<");
+    str = std::regex_replace(str, std::regex("&gt;"), ">");
+}
+
 std::vector<string> parse_website(const string &profile, const string &xml)
 {
    pt::ptree json;
@ -89,14 +114,13 @@ std::vector<string> parse_website(const string &profile, const string &xml)
                string str = title + "\n\n" + desc;

                // ANF News puts this always on top, causing us to think it's new
-                if (title.compare("Newsticker zu den Angriffen auf Efrîn") == 0)
+                if (title.compare("Newsticker zu den Angriffen auf Efrîn ") == 0)
                {
                    continue;
                }

-                // Some feeds contain encoded xhtml-tags >:|
-                std::regex relt("&lt;");
-                std::regex regt("&gt;");
+                unescape_html(str);
+
                std::regex reparagraph("</p><p>");
                std::regex recdata1("<!\\[CDATA\\[");
                std::regex recdata2("\\]\\]>");
@ -109,8 +133,6 @@ std::vector<string> parse_website(const string &profile, const string &xml)
                // GG/BO closing
                std::regex reggboclosing("Die von den einzelnen AutorInnen .*$");

-                str = std::regex_replace(str, relt, "<");
-                str = std::regex_replace(str, regt, ">");
                str = std::regex_replace(str, reparagraph, "\n\n");
                str = std::regex_replace(str, recdata1, "");
                str = std::regex_replace(str, recdata2, "");