Extract and parse <item>s from RSS feeds.

This commit is contained in:
tastytea 2019-12-25 02:08:53 +01:00
parent 91acfb0bc3
commit 2bb03dd104
Signed by: tastytea
GPG Key ID: CFC39497F1B26E07
3 changed files with 141 additions and 9 deletions

View File

@ -19,19 +19,28 @@
#include "version.hpp" #include "version.hpp"
#include <boost/log/trivial.hpp> #include <boost/log/trivial.hpp>
#include <boost/property_tree/xml_parser.hpp>
#include <boost/regex.hpp>
#include <mastodon-cpp/mastodon-cpp.hpp>
#include <restclient-cpp/connection.h> #include <restclient-cpp/connection.h>
#include <restclient-cpp/restclient.h> #include <restclient-cpp/restclient.h>
#include <list>
#include <sstream>
#include <string> #include <string>
#include <utility> #include <utility>
namespace mastorss namespace mastorss
{ {
using boost::regex;
using boost::regex_replace;
using std::list;
using std::istringstream;
using std::string; using std::string;
using std::move; using std::move;
Document::Document(string uri) Document::Document(const ProfileData &data)
: _uri{move(uri)} : _data{data}
{ {
RestClient::init(); RestClient::init();
@ -45,7 +54,7 @@ Document::~Document()
void Document::download() void Document::download()
{ {
RestClient::Connection connection(_uri); RestClient::Connection connection(_data.feedurl);
connection.SetUserAgent(string("mastorss/").append(version)); connection.SetUserAgent(string("mastorss/").append(version));
connection.FollowRedirects(true, 10); connection.FollowRedirects(true, 10);
@ -56,7 +65,7 @@ void Document::download()
case 200: case 200:
{ {
_raw_doc = response.body; _raw_doc = response.body;
BOOST_LOG_TRIVIAL(debug) << "Downloaded feed: " << _uri; BOOST_LOG_TRIVIAL(debug) << "Downloaded feed: " << _data.feedurl;
break; break;
} }
case 301: case 301:
@ -76,4 +85,90 @@ void Document::download()
} }
} }
} }
void Document::parse()
{
pt::ptree tree;
istringstream iss{_raw_doc};
pt::read_xml(iss, tree);
if (tree.front().first == "rss")
{
parse_rss(tree);
}
}
/*!
 * @brief Collect the not yet seen `<item>`s of an RSS feed in `new_items`.
 *
 * Walks the children of `rss.channel` in document order and stops at the
 * first item whose GUID equals `_data.last_guid`. Items whose title starts
 * with one of the strings in `_data.skip` are left out.
 *
 * @param tree The parsed feed; must contain `rss.channel`, otherwise
 *             `tree.get_child` throws.
 */
void Document::parse_rss(const pt::ptree &tree)
{
    for (const auto &child : tree.get_child("rss.channel"))
    {
        if (child.first != "item")
        {
            continue;
        }

        const auto &rssitem = child.second;

        // All children of <item> are optional in RSS 2.0. Read them with an
        // empty default so that a missing element does not throw and abort
        // the whole feed.
        string guid{rssitem.get<string>("guid", string{})};
        string link{rssitem.get<string>("link", string{})};
        if (guid.empty()) // We hope either <guid> or <link> are present.
        {
            guid = link;
        }
        if (guid.empty())
        {
            // Without an identifier the item can not be recognised again.
            BOOST_LOG_TRIVIAL(warning)
                << "Skipped item without <guid> and <link>.";
            continue;
        }

        if (guid == _data.last_guid)
        {
            break;
        }

        string title{rssitem.get<string>("title", string{})};
        bool skipthis{false};
        for (const auto &skip : _data.skip)
        {
            // Prefix comparison without creating a temporary substring.
            if (title.compare(0, skip.length(), skip) == 0)
            {
                skipthis = true;
                break;
            }
        }
        if (skipthis)
        {
            BOOST_LOG_TRIVIAL(debug) << "Skipped GUID: " << guid;
            continue;
        }

        Item item;
        item.description
            = remove_html(rssitem.get<string>("description", string{}));
        item.guid = move(guid);
        item.link = move(link);
        item.title = move(title);

        // Log first, then hand the item over without copying it.
        BOOST_LOG_TRIVIAL(debug) << "Found GUID: " << item.guid;
        new_items.push_back(move(item));
    }
}
/*!
 * @brief Turn an HTML fragment into plain text.
 *
 * Decodes HTML entities, replaces `<p>` with a blank line, strips CDATA
 * markers, tags and carriage returns and normalises the remaining newlines.
 *
 * @param html The HTML fragment.
 *
 * @return The text without markup.
 */
string Document::remove_html(string html) const
{
    // This function runs once per feed item, so the expressions are compiled
    // only on the first call. The order of the steps matters.
    static const list<std::pair<regex, const char *>> steps
    {
        {regex{"<p>"}, "\n\n"},                   // Paragraphs.
        {regex{R"(<!\[CDATA\[)"}, ""},            // CDATA beginning.
        {regex{R"(\]\]>)"}, ""},                  // CDATA end.
        {regex{"<[^>]+>"}, ""},                   // HTML tags.
        {regex{R"(\r)"}, ""},                     // Carriage return.
        {regex{"\\n[ \\t\u00a0]+\\n"}, ""},       // Whitespace between newlines.
        {regex{R"(^\n+)"}, ""},                   // Newlines at the beginning.
        {regex{R"(\n{3,})"}, "\n\n"},             // Excess newlines.
        // Single newlines become spaces (?<= is lookbehind, ?= is lookahead).
        {regex{R"((?<=[^\n])\n(?=[^\n]))"}, " "}
    };

    html = Mastodon::unescape_html(html); // Decode HTML entities.

    for (const auto &[re, replacement] : steps)
    {
        html = regex_replace(html, re, replacement);
    }

    return html;
}
} // namespace mastorss } // namespace mastorss

View File

@ -17,27 +17,59 @@
#ifndef MASTORSS_DOCUMENT_HPP #ifndef MASTORSS_DOCUMENT_HPP
#define MASTORSS_DOCUMENT_HPP #define MASTORSS_DOCUMENT_HPP
#include "config.hpp"
#include <boost/property_tree/ptree.hpp>
#include <string> #include <string>
#include <vector>
namespace mastorss namespace mastorss
{ {
namespace pt = boost::property_tree;
using std::string; using std::string;
using std::vector;
/*!
* @brief An Item of a feed.
*
* Filled by Document::parse_rss() from the children of an RSS `<item>`.
*
* @since 0.10.0
*/
struct Item
{
// Content of <description>, passed through Document::remove_html().
string description;
// Content of <guid>; the content of <link> is used if <guid> is empty.
string guid;
// Content of <link>.
string link;
// Content of <title>.
string title;
};
/*!
* @brief A feed.
*
* Downloads the feed named by the profile data with download() and extracts
* its items with parse().
*
* @since 0.10.0
*/
class Document class Document
{ {
public: public:
// Stores a reference to `data`, it is not copied.
explicit Document(string uri); explicit Document(const ProfileData &data);
~Document(); ~Document();
Document(const Document &other) = default; Document(const Document &other) = default;
Document &operator=(const Document &other) = delete; Document &operator=(const Document &other) = delete;
Document(Document &&other) = default; Document(Document &&other) = default;
Document &operator=(Document &&other) = delete; Document &operator=(Document &&other) = delete;
// Items appended by parse(), in the order they appear in the feed.
vector<Item> new_items;
// Fetches the feed URL; on HTTP status 200 the body is kept in _raw_doc.
void download();
// Reads _raw_doc as XML and calls parse_rss() for RSS documents.
void parse();
private: private:
// NOTE(review): held by reference, so the ProfileData passed to the
// constructor has to outlive this object — confirm at every call site.
const string _uri; const ProfileData &_data;
// The downloaded, unparsed feed.
string _raw_doc; string _raw_doc;
// Appends the new <item>s below rss.channel to new_items.
void parse_rss(const pt::ptree &tree);
// Returns `html` with entities decoded and tags removed.
[[nodiscard]]
string remove_html(string html) const;
}; };
} // namespace mastorss } // namespace mastorss

View File

@ -85,13 +85,18 @@ int main(int argc, char *argv[])
} }
else else
{ {
const string_view profile = args[1]; const string_view profile{args[1]};
BOOST_LOG_TRIVIAL(debug) << "Using profile: " << profile; BOOST_LOG_TRIVIAL(debug) << "Using profile: " << profile;
try try
{ {
Config cfg(profile.data()); Config cfg{profile.data()};
Document doc(cfg.data.feedurl); Document doc{cfg.data};
doc.parse();
for (const auto &item : doc.new_items)
{
cout << "--\n" << item.description.substr(0, 200) << "\n";
}
} }
catch (const FileException &e) catch (const FileException &e)
{ {