From 2bb03dd10470ba150047ff2ef3a036d6450dc855 Mon Sep 17 00:00:00 2001 From: tastytea Date: Wed, 25 Dec 2019 02:08:53 +0100 Subject: [PATCH] Extract and parse items from RSS feeds. --- src/document.cpp | 103 +++++++++++++++++++++++++++++++++++++++++++++-- src/document.hpp | 36 ++++++++++++++++- src/main.cpp | 11 +++-- 3 files changed, 141 insertions(+), 9 deletions(-) diff --git a/src/document.cpp b/src/document.cpp index bd7531b..665a6b3 100644 --- a/src/document.cpp +++ b/src/document.cpp @@ -19,19 +19,28 @@ #include "version.hpp" #include +#include +#include +#include #include #include +#include +#include #include #include namespace mastorss { +using boost::regex; +using boost::regex_replace; +using std::list; +using std::istringstream; using std::string; using std::move; -Document::Document(string uri) - : _uri{move(uri)} +Document::Document(const ProfileData &data) + : _data{data} { RestClient::init(); @@ -45,7 +54,7 @@ Document::~Document() void Document::download() { - RestClient::Connection connection(_uri); + RestClient::Connection connection(_data.feedurl); connection.SetUserAgent(string("mastorss/").append(version)); connection.FollowRedirects(true, 10); @@ -56,7 +65,7 @@ void Document::download() case 200: { _raw_doc = response.body; - BOOST_LOG_TRIVIAL(debug) << "Downloaded feed: " << _uri; + BOOST_LOG_TRIVIAL(debug) << "Downloaded feed: " << _data.feedurl; break; } case 301: @@ -76,4 +85,90 @@ void Document::download() } } } + +void Document::parse() +{ + pt::ptree tree; + istringstream iss{_raw_doc}; + pt::read_xml(iss, tree); + + if (tree.front().first == "rss") + { + parse_rss(tree); + } +} + +void Document::parse_rss(const pt::ptree &tree) +{ + for (const auto &child : tree.get_child("rss.channel")) + { + if (child.first == "item") + { + const auto &rssitem = child.second; + + string guid{rssitem.get("guid")}; + if (guid.empty()) // We hope either guid or link is present. 
+ { + guid = rssitem.get("link"); + } + if (guid == _data.last_guid) + { + break; + } + + bool skipthis{false}; + string title = rssitem.get("title"); + for (const auto &skip : _data.skip) + { + if (title.substr(0, skip.length()) == skip) + { + skipthis = true; + break; + } + } + if (skipthis) + { + BOOST_LOG_TRIVIAL(debug) << "Skipped GUID: " << guid; + continue; + } + + Item item; + item.description = remove_html(rssitem.get("description")); + item.guid = move(guid); + item.link = rssitem.get("link"); + item.title = move(title); + new_items.push_back(item); + + BOOST_LOG_TRIVIAL(debug) << "Found GUID: " << item.guid; + } + } +} + +string Document::remove_html(string html) const +{ + html = Mastodon::unescape_html(html); // Decode HTML entities. + + html = regex_replace(html, regex{"

"}, "\n\n"); + + const list re_list + { + regex{R"()"}, // CDATA end. + regex{"<[^>]+>"}, // HTML tags. + regex{R"(\r)"}, // Carriage return. + regex{"\\n[ \\t\u00a0]+\\n"}, // Whitespace between newlines. + regex{R"(^\n+)"} // Newlines at the beginning. + }; + for (const regex &re : re_list) + { + html = regex_replace(html, re, ""); + } + + // Remove excess newlines. + html = regex_replace(html, regex{R"(\n{3,})"}, "\n\n"); + // Replace single newlines with spaces (?<= is lookbehind, ?= is lookahead). + html = regex_replace(html, regex{R"((?<=[^\n])\n(?=[^\n]))"}, " "); + + return html; +} } // namespace mastorss diff --git a/src/document.hpp b/src/document.hpp index 1a5ddf1..b69267b 100644 --- a/src/document.hpp +++ b/src/document.hpp @@ -17,27 +17,59 @@ #ifndef MASTORSS_DOCUMENT_HPP #define MASTORSS_DOCUMENT_HPP +#include "config.hpp" + +#include + #include +#include namespace mastorss { +namespace pt = boost::property_tree; using std::string; +using std::vector; +/*! + * @brief An Item of a feed. + * + * @since 0.10.0 + */ +struct Item +{ + string description; + string guid; + string link; + string title; +}; + +/*! + * @brief A feed. 
+ * + * @since 0.10.0 + */ class Document { public: - explicit Document(string uri); + explicit Document(const ProfileData &data); ~Document(); Document(const Document &other) = default; Document &operator=(const Document &other) = delete; Document(Document &&other) = default; Document &operator=(Document &&other) = delete; + vector new_items; + void download(); + void parse(); private: - const string _uri; + const ProfileData &_data; string _raw_doc; + + void parse_rss(const pt::ptree &tree); + [[nodiscard]] + string remove_html(string html) const; }; } // namespace mastorss diff --git a/src/main.cpp b/src/main.cpp index 0f6d1de..4da5a97 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -85,13 +85,18 @@ int main(int argc, char *argv[]) } else { - const string_view profile = args[1]; + const string_view profile{args[1]}; BOOST_LOG_TRIVIAL(debug) << "Using profile: " << profile; try { - Config cfg(profile.data()); - Document doc(cfg.data.feedurl); + Config cfg{profile.data()}; + Document doc{cfg.data}; + doc.parse(); + for (const auto &item : doc.new_items) + { + cout << "--\n" << item.description.substr(0, 200) << "\n"; + } } catch (const FileException &e) {