Extract and parse <item>s from RSS feeds.
This commit is contained in:
parent
91acfb0bc3
commit
2bb03dd104
103
src/document.cpp
103
src/document.cpp
|
@ -19,19 +19,28 @@
|
|||
#include "version.hpp"
|
||||
|
||||
#include <boost/log/trivial.hpp>
|
||||
#include <boost/property_tree/xml_parser.hpp>
|
||||
#include <boost/regex.hpp>
|
||||
#include <mastodon-cpp/mastodon-cpp.hpp>
|
||||
#include <restclient-cpp/connection.h>
|
||||
#include <restclient-cpp/restclient.h>
|
||||
|
||||
#include <list>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
|
||||
namespace mastorss
|
||||
{
|
||||
using boost::regex;
|
||||
using boost::regex_replace;
|
||||
using std::list;
|
||||
using std::istringstream;
|
||||
using std::string;
|
||||
using std::move;
|
||||
|
||||
Document::Document(string uri)
|
||||
: _uri{move(uri)}
|
||||
Document::Document(const ProfileData &data)
|
||||
: _data{data}
|
||||
{
|
||||
RestClient::init();
|
||||
|
||||
|
@ -45,7 +54,7 @@ Document::~Document()
|
|||
|
||||
void Document::download()
|
||||
{
|
||||
RestClient::Connection connection(_uri);
|
||||
RestClient::Connection connection(_data.feedurl);
|
||||
connection.SetUserAgent(string("mastorss/").append(version));
|
||||
connection.FollowRedirects(true, 10);
|
||||
|
||||
|
@ -56,7 +65,7 @@ void Document::download()
|
|||
case 200:
|
||||
{
|
||||
_raw_doc = response.body;
|
||||
BOOST_LOG_TRIVIAL(debug) << "Downloaded feed: " << _uri;
|
||||
BOOST_LOG_TRIVIAL(debug) << "Downloaded feed: " << _data.feedurl;
|
||||
break;
|
||||
}
|
||||
case 301:
|
||||
|
@ -76,4 +85,90 @@ void Document::download()
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
void Document::parse()
|
||||
{
|
||||
pt::ptree tree;
|
||||
istringstream iss{_raw_doc};
|
||||
pt::read_xml(iss, tree);
|
||||
|
||||
if (tree.front().first == "rss")
|
||||
{
|
||||
parse_rss(tree);
|
||||
}
|
||||
}
|
||||
|
||||
// Collect the not yet seen <item>s of an RSS feed into new_items.
//
// The children of rss.channel are visited in document order. Iteration stops
// at the first item whose GUID equals _data.last_guid. Items whose title
// starts with one of the strings in _data.skip are logged and left out.
//
// tree must contain rss.channel; get_child() throws otherwise. An item that
// has neither <guid> nor <link> still raises the property-tree exception from
// the <link> lookup, because there is nothing to identify it by.
void Document::parse_rss(const pt::ptree &tree)
{
    for (const auto &child : tree.get_child("rss.channel"))
    {
        if (child.first != "item")
        {
            continue;
        }
        const auto &rssitem = child.second;

        // <guid> is optional. Ask for it with a default so that a missing
        // element yields an empty string instead of an exception; only then
        // can the <link> fallback below take effect.
        string guid{rssitem.get<string>("guid", "")};
        if (guid.empty()) // We hope either <guid> or <link> are present.
        {
            guid = rssitem.get<string>("link");
        }
        if (guid == _data.last_guid)
        {
            break;
        }

        // RSS 2.0 makes every child of <item> optional, so a missing <title>,
        // <description> or <link> must not abort the whole feed.
        string title{rssitem.get<string>("title", "")};

        bool skipthis{false};
        for (const auto &skip : _data.skip)
        {
            // Prefix test without building a temporary substring.
            if (title.compare(0, skip.length(), skip) == 0)
            {
                skipthis = true;
                break;
            }
        }
        if (skipthis)
        {
            BOOST_LOG_TRIVIAL(debug) << "Skipped GUID: " << guid;
            continue;
        }

        Item item;
        item.description = remove_html(rssitem.get<string>("description", ""));
        item.guid = move(guid);
        item.link = rssitem.get<string>("link", "");
        item.title = move(title);

        // Log before handing the item over, so it can be moved, not copied.
        BOOST_LOG_TRIVIAL(debug) << "Found GUID: " << item.guid;
        new_items.push_back(move(item));
    }
}
|
||||
|
||||
// Turn an HTML fragment into plain text.
//
// Steps, in order: decode HTML entities, turn every <p> into a blank line,
// strip CDATA markers, remaining tags and carriage returns, empty out
// whitespace-only lines, drop leading newlines, collapse runs of three or more
// newlines into one blank line, and finally join lines that are separated by a
// single newline with a space.
//
// Returns the converted text; html is taken by value and used as the buffer.
string Document::remove_html(string html) const
{
    html = Mastodon::unescape_html(html); // Decode HTML entities.

    // Paragraph starts become blank lines before all other tags are dropped.
    html = regex_replace(html, regex{"<p>"}, "\n\n");

    const list re_list
    {
        regex{R"(<!\[CDATA\[)"},                // CDATA beginning.
        regex{R"(\]\]>)"},                      // CDATA end.
        regex{"<[^>]+>"},                       // HTML tags.
        regex{R"(\r)"},                         // Carriage return.
        // Whitespace between newlines. Lookbehind/lookahead keep the two
        // newlines themselves; matching them as well would delete the line
        // break and glue the neighbouring text together.
        // NOTE(review): boost::regex on std::string works on bytes, so the
        // NBSP is matched as its individual UTF-8 bytes; confirm that the
        // source file is compiled as UTF-8.
        regex{"(?<=\\n)[ \\t\u00a0]+(?=\\n)"},
        // Newlines at the beginning. \A anchors to the start of the text
        // only; ^ would also match after every embedded newline in
        // Boost.Regex and thereby remove all blank lines between paragraphs.
        regex{R"(\A\n+)"}
    };
    for (const regex &re : re_list)
    {
        html = regex_replace(html, re, "");
    }

    // Remove excess newlines.
    html = regex_replace(html, regex{R"(\n{3,})"}, "\n\n");
    // Replace single newlines with spaces (?<= is lookbehind, ?= is lookahead).
    html = regex_replace(html, regex{R"((?<=[^\n])\n(?=[^\n]))"}, " ");

    return html;
}
|
||||
} // namespace mastorss
|
||||
|
|
|
@ -17,27 +17,59 @@
|
|||
#ifndef MASTORSS_DOCUMENT_HPP
|
||||
#define MASTORSS_DOCUMENT_HPP
|
||||
|
||||
#include "config.hpp"
|
||||
|
||||
#include <boost/property_tree/ptree.hpp>
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
namespace mastorss
|
||||
{
|
||||
namespace pt = boost::property_tree;
|
||||
using std::string;
|
||||
using std::vector;
|
||||
|
||||
/*!
|
||||
* @brief An Item of a feed.
|
||||
*
|
||||
* @since 0.10.0
|
||||
*/
|
||||
struct Item
|
||||
{
|
||||
string description;
|
||||
string guid;
|
||||
string link;
|
||||
string title;
|
||||
};
|
||||
|
||||
/*!
|
||||
* @brief A feed.
|
||||
*
|
||||
* @since 0.10.0
|
||||
*/
|
||||
// Downloads one feed, parses it and exposes the items found in new_items.
class Document
|
||||
{
|
||||
public:
|
||||
// NOTE(review): this listing is a rendered diff. This constructor looks like
// the removed counterpart of the ProfileData constructor below -- confirm.
explicit Document(string uri);
|
||||
// Binds the document to a profile. `data` is stored as a reference (see
// _data), so it has to outlive the Document. Also calls RestClient::init().
explicit Document(const ProfileData &data);
|
||||
~Document();
|
||||
// Copy and move construction are defaulted; both kinds of assignment are
// deleted.
Document(const Document &other) = default;
|
||||
Document &operator=(const Document &other) = delete;
|
||||
Document(Document &&other) = default;
|
||||
Document &operator=(Document &&other) = delete;
|
||||
|
||||
// Items collected by parse(); entries are appended in feed order.
vector<Item> new_items;
|
||||
|
||||
// Requests _data.feedurl over HTTP; on status 200 the response body is kept
// in _raw_doc.
void download();
|
||||
// Reads _raw_doc as XML and, for RSS documents, fills new_items.
void parse();
|
||||
|
||||
private:
|
||||
// NOTE(review): appears to be the removed counterpart of _data in this
// rendered diff -- confirm.
const string _uri;
|
||||
// Profile settings read by this class: feedurl, last_guid and skip.
const ProfileData &_data;
|
||||
// Body of the HTTP response stored by download().
string _raw_doc;
|
||||
|
||||
// Walks rss.channel of `tree` and appends each accepted <item> to new_items.
// Stops at the item whose GUID equals _data.last_guid and leaves out items
// whose title starts with an entry of _data.skip.
void parse_rss(const pt::ptree &tree);
|
||||
// Returns `html` with entities decoded, tags and CDATA markers removed and
// newlines normalised.
[[nodiscard]]
|
||||
string remove_html(string html) const;
|
||||
};
|
||||
} // namespace mastorss
|
||||
|
||||
|
|
11
src/main.cpp
11
src/main.cpp
|
@ -85,13 +85,18 @@ int main(int argc, char *argv[])
|
|||
}
|
||||
else
|
||||
{
|
||||
const string_view profile = args[1];
|
||||
const string_view profile{args[1]};
|
||||
BOOST_LOG_TRIVIAL(debug) << "Using profile: " << profile;
|
||||
|
||||
try
|
||||
{
|
||||
Config cfg(profile.data());
|
||||
Document doc(cfg.data.feedurl);
|
||||
Config cfg{profile.data()};
|
||||
Document doc{cfg.data};
|
||||
doc.parse();
|
||||
for (const auto &item : doc.new_items)
|
||||
{
|
||||
cout << "--\n" << item.description.substr(0, 200) << "\n";
|
||||
}
|
||||
}
|
||||
catch (const FileException &e)
|
||||
{
|
||||
|
|
Loading…
Reference in New Issue
Block a user