2019-12-20 01:14:23 +01:00
|
|
|
/* This file is part of mastorss.
|
2020-06-19 22:20:03 +02:00
|
|
|
* Copyright © 2019, 2020 tastytea <tastytea@tastytea.de>
|
2019-12-20 01:14:23 +01:00
|
|
|
*
|
|
|
|
* This program is free software: you can redistribute it and/or modify
|
|
|
|
* it under the terms of the GNU General Public License as published by
|
|
|
|
* the Free Software Foundation, version 3.
|
|
|
|
*
|
|
|
|
* This program is distributed in the hope that it will be useful,
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
* GNU General Public License for more details.
|
|
|
|
*
|
|
|
|
* You should have received a copy of the GNU General Public License
|
|
|
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include "document.hpp"
|
2020-06-19 23:04:42 +02:00
|
|
|
|
2020-11-21 22:30:24 +01:00
|
|
|
#include "curl_wrapper.hpp"
|
2019-12-20 01:14:23 +01:00
|
|
|
#include "exceptions.hpp"
|
|
|
|
#include "version.hpp"
|
|
|
|
|
2019-12-24 18:53:45 +01:00
|
|
|
#include <boost/log/trivial.hpp>
|
2019-12-25 02:08:53 +01:00
|
|
|
#include <boost/property_tree/xml_parser.hpp>
|
|
|
|
#include <boost/regex.hpp>
|
2019-12-28 09:48:42 +01:00
|
|
|
#include <json/json.h>
|
2020-01-15 19:05:49 +01:00
|
|
|
#include <mastodonpp/mastodonpp.hpp>
|
2019-12-20 01:14:23 +01:00
|
|
|
|
2019-12-28 09:48:42 +01:00
|
|
|
#include <algorithm>
|
|
|
|
#include <fstream>
|
2019-12-25 02:08:53 +01:00
|
|
|
#include <sstream>
|
2020-08-24 18:14:12 +02:00
|
|
|
#include <stdexcept>
|
2019-12-20 01:14:23 +01:00
|
|
|
#include <string>
|
|
|
|
#include <utility>
|
|
|
|
|
2019-12-24 18:53:45 +01:00
|
|
|
namespace mastorss
|
|
|
|
{
|
2019-12-25 02:08:53 +01:00
|
|
|
using boost::regex;
|
|
|
|
using boost::regex_replace;
|
2019-12-29 04:00:48 +01:00
|
|
|
using std::any_of;
|
2019-12-28 09:48:42 +01:00
|
|
|
using std::ifstream;
|
2019-12-25 02:08:53 +01:00
|
|
|
using std::istringstream;
|
2019-12-20 01:14:23 +01:00
|
|
|
using std::move;
|
2020-06-19 23:04:42 +02:00
|
|
|
using std::string;
|
|
|
|
using std::stringstream;
|
|
|
|
using std::transform;
|
2019-12-20 01:14:23 +01:00
|
|
|
|
2020-06-19 23:04:42 +02:00
|
|
|
bool operator!=(const Item &a, const Item &b)
|
2019-12-28 07:12:28 +01:00
|
|
|
{
|
|
|
|
return a.guid != b.guid;
|
|
|
|
}
|
|
|
|
|
2019-12-25 06:26:20 +01:00
|
|
|
Document::Document(Config &cfg)
|
|
|
|
: _cfg{cfg}
|
2019-12-28 09:10:32 +01:00
|
|
|
, _profiledata{_cfg.profiledata}
|
2019-12-20 01:14:23 +01:00
|
|
|
{
|
|
|
|
download();
|
|
|
|
}
|
|
|
|
|
2019-12-29 00:41:14 +01:00
|
|
|
void Document::download(const string &uri, const bool temp_redirect)
|
2019-12-20 01:14:23 +01:00
|
|
|
{
|
2020-11-21 22:30:24 +01:00
|
|
|
namespace cw = curl_wrapper;
|
|
|
|
|
2020-08-24 16:52:20 +02:00
|
|
|
BOOST_LOG_TRIVIAL(debug) << "Downloading <" << uri << "> …";
|
2020-11-21 22:30:24 +01:00
|
|
|
cw::CURLWrapper curl;
|
|
|
|
curl.set_useragent(string("mastorss/") += version);
|
|
|
|
curl.set_maxredirs(0);
|
2019-12-20 01:14:23 +01:00
|
|
|
|
2020-11-21 22:30:24 +01:00
|
|
|
const auto answer{curl.make_http_request(cw::http_method::GET, uri)};
|
2019-12-20 01:14:23 +01:00
|
|
|
|
2020-11-21 22:30:24 +01:00
|
|
|
BOOST_LOG_TRIVIAL(debug) << "Got response: " << answer.status;
|
2020-08-24 16:52:20 +02:00
|
|
|
BOOST_LOG_TRIVIAL(debug) << "Got Headers:";
|
2020-11-21 22:30:24 +01:00
|
|
|
BOOST_LOG_TRIVIAL(debug) << answer.headers;
|
2020-08-24 16:52:20 +02:00
|
|
|
|
2020-11-21 22:30:24 +01:00
|
|
|
switch (answer.status)
|
2019-12-20 01:14:23 +01:00
|
|
|
{
|
|
|
|
case 200:
|
|
|
|
{
|
2020-11-21 22:30:24 +01:00
|
|
|
_raw_doc = answer.body;
|
2019-12-28 09:10:32 +01:00
|
|
|
BOOST_LOG_TRIVIAL(debug) << "Downloaded feed: " << _profiledata.feedurl;
|
2019-12-20 01:14:23 +01:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
case 301:
|
|
|
|
case 308:
|
|
|
|
{
|
2019-12-29 00:41:14 +01:00
|
|
|
if (temp_redirect)
|
|
|
|
{
|
2019-12-29 08:33:06 +01:00
|
|
|
goto temporary_redirect; // NOLINT(cppcoreguidelines-avoid-goto)
|
2019-12-29 00:41:14 +01:00
|
|
|
}
|
2020-11-21 22:30:24 +01:00
|
|
|
_profiledata.feedurl = extract_location(answer);
|
2019-12-28 09:10:32 +01:00
|
|
|
if (_profiledata.feedurl.empty())
|
2019-12-25 06:28:31 +01:00
|
|
|
{
|
2020-11-21 22:30:24 +01:00
|
|
|
throw HTTPException{answer.status};
|
2019-12-25 06:28:31 +01:00
|
|
|
}
|
|
|
|
|
2020-06-19 23:04:42 +02:00
|
|
|
// clang-format off
|
2019-12-25 19:31:07 +01:00
|
|
|
BOOST_LOG_TRIVIAL(debug) << "Feed has new location (permanent): "
|
2019-12-28 09:10:32 +01:00
|
|
|
<< _profiledata.feedurl;
|
2020-06-19 23:04:42 +02:00
|
|
|
// clang-format on
|
2019-12-25 06:28:31 +01:00
|
|
|
_cfg.write();
|
|
|
|
download();
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
case 302:
|
|
|
|
case 303:
|
|
|
|
case 307:
|
|
|
|
{
|
2020-08-24 16:52:20 +02:00
|
|
|
temporary_redirect:
|
2020-11-21 22:30:24 +01:00
|
|
|
const string newuri{extract_location(answer)};
|
2019-12-25 06:28:31 +01:00
|
|
|
if (newuri.empty())
|
|
|
|
{
|
2020-11-21 22:30:24 +01:00
|
|
|
throw HTTPException{answer.status};
|
2019-12-25 06:28:31 +01:00
|
|
|
}
|
|
|
|
|
2020-06-19 23:04:42 +02:00
|
|
|
// clang-format off
|
2019-12-25 19:31:07 +01:00
|
|
|
BOOST_LOG_TRIVIAL(debug) << "Feed has new location (temporary): "
|
2019-12-29 00:49:23 +01:00
|
|
|
<< newuri;
|
2020-06-19 23:04:42 +02:00
|
|
|
// clang-format on
|
2019-12-29 00:41:14 +01:00
|
|
|
download(newuri, true);
|
2019-12-25 06:28:31 +01:00
|
|
|
break;
|
2019-12-20 01:14:23 +01:00
|
|
|
}
|
|
|
|
default:
|
|
|
|
{
|
2020-11-21 22:30:24 +01:00
|
|
|
throw HTTPException{answer.status};
|
2019-12-20 01:14:23 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2019-12-25 02:08:53 +01:00
|
|
|
|
2019-12-25 06:28:31 +01:00
|
|
|
void Document::download()
|
|
|
|
{
|
2019-12-28 09:10:32 +01:00
|
|
|
download(_profiledata.feedurl);
|
2019-12-25 06:28:31 +01:00
|
|
|
}
|
|
|
|
|
2019-12-25 02:08:53 +01:00
|
|
|
void Document::parse()
|
|
|
|
{
|
2020-11-21 20:09:35 +01:00
|
|
|
if (_profiledata.add_hashtags)
|
|
|
|
{
|
|
|
|
parse_watchwords();
|
|
|
|
}
|
2019-12-25 02:08:53 +01:00
|
|
|
pt::ptree tree;
|
|
|
|
istringstream iss{_raw_doc};
|
|
|
|
pt::read_xml(iss, tree);
|
|
|
|
|
|
|
|
if (tree.front().first == "rss")
|
|
|
|
{
|
2019-12-25 02:42:47 +01:00
|
|
|
BOOST_LOG_TRIVIAL(debug) << "RSS detected.";
|
2019-12-25 02:08:53 +01:00
|
|
|
parse_rss(tree);
|
|
|
|
}
|
2020-01-01 12:53:11 +01:00
|
|
|
else
|
|
|
|
{
|
|
|
|
throw ParseException{"Could not detect type of feed."};
|
|
|
|
}
|
2019-12-25 02:08:53 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
// Collect the not yet posted items of an RSS feed in new_items.
//
// Walks the <item> children of rss.channel in document order and pushes each
// new one to the FRONT of new_items, so the container ends up in reverse
// document order.
//
// Parsing stops when
//   * Config::max_guids items have been looked at,
//   * an already known GUID is met and keep_looking is off, or
//   * the first item was collected, no GUIDs are stored and keep_looking is
//     off ("first run").
//
// Items whose title starts with one of the profile's skip strings are
// ignored. The ptree getters without a default throw if <link>, <title> or
// <description> is missing.
void Document::parse_rss(const pt::ptree &tree)
{
    size_t counter{0};
    for (const auto &child : tree.get_child("rss.channel"))
    {
        // Only <item> elements count towards the limit; other children of
        // <channel> (title, link, description, …) are not items.
        if (child.first != "item")
        {
            continue;
        }

        if (counter == Config::max_guids)
        {
            BOOST_LOG_TRIVIAL(debug)
                << "Maximum number of items reached. Stopped parsing.";
            break;
        }
        ++counter;

        const auto &rssitem = child.second;

        string guid{rssitem.get<string>("guid", "")};
        if (guid.empty()) // We hope either <guid> or <link> are present.
        {
            guid = rssitem.get<string>("link");
        }

        const bool already_posted{
            any_of(_profiledata.guids.begin(), _profiledata.guids.end(),
                   [&](const auto &old_guid) { return guid == old_guid; })};
        if (already_posted)
        {
            // clang-format off
            BOOST_LOG_TRIVIAL(debug) << "Found already posted GUID: "
                                     << guid;
            // clang-format on
            if (_profiledata.keep_looking)
            {
                continue;
            }

            BOOST_LOG_TRIVIAL(debug) << "Stopped parsing.";
            break;
        }

        const string title{rssitem.get<string>("title")};
        // Prefix test without creating a temporary substring.
        const bool skipped{
            any_of(_profiledata.skip.begin(), _profiledata.skip.end(),
                   [&title](const string &skip)
                   { return title.compare(0, skip.size(), skip) == 0; })};
        if (skipped)
        {
            BOOST_LOG_TRIVIAL(debug) << "Skipped GUID: " << guid;
            continue;
        }

        // Description: apply the profile's regex fixes, strip the HTML and
        // optionally turn watchwords into hashtags.
        string desc{rssitem.get<string>("description")};
        for (const auto &fix : _profiledata.fixes)
        {
            desc = regex_replace(desc, regex{fix}, "");
        }
        desc = remove_html(desc);
        if (_profiledata.add_hashtags)
        {
            desc = add_hashtags(desc);
        }

        Item item;
        item.description = std::move(desc);
        item.guid = std::move(guid);
        item.link = rssitem.get<string>("link");
        item.title = mastodonpp::unescape_html(title);

        BOOST_LOG_TRIVIAL(debug) << "Found GUID: " << item.guid;
        new_items.push_front(std::move(item));

        if (_profiledata.guids.empty() && !_profiledata.keep_looking)
        {
            BOOST_LOG_TRIVIAL(debug) << "This is the first run.";
            break;
        }
    }
}
|
|
|
|
|
|
|
|
// Convert an HTML fragment to plain text.
//
// Steps, in this order: decode HTML entities, turn <p> into a blank line,
// delete everything matched by the patterns below, collapse runs of three or
// more newlines to two and finally replace single newlines with a space.
string Document::remove_html(string html) const
{
    // Decode HTML entities.
    html = mastodonpp::unescape_html(html);

    // Paragraph starts become blank lines.
    html = regex_replace(html, regex{"<p>"}, "\n\n");

    // Everything matching one of these is deleted. They are applied one
    // after another in the listed order.
    const regex deletions[]{
        regex{R"(<!\[CDATA\[)"},          // CDATA beginning.
        regex{R"(\]\]>)"},                // CDATA end.
        regex{"<[^>]+>"},                 // HTML tags.
        regex{R"(\r)"},                   // Carriage return.
        regex{"\\n[ \\t\u00a0]+\\n"},     // Space between newlines.
        regex{R"(^\n+)"}};                // Newlines at beginning.
    for (const auto &pattern : deletions)
    {
        html = regex_replace(html, pattern, "");
    }

    // Remove excess newlines.
    const regex re_excess_newlines{R"(\n{3,})"};
    html = regex_replace(html, re_excess_newlines, "\n\n");

    // Replace single newlines with spaces (?<= is lookbehind, ?= is
    // lookahead).
    const regex re_single_newline{R"((?<=[^\n])\n(?=[^\n]))"};
    html = regex_replace(html, re_single_newline, " ");

    BOOST_LOG_TRIVIAL(debug) << "Converted HTML to text.";

    return html;
}
|
2019-12-25 06:28:31 +01:00
|
|
|
|
2020-11-21 22:30:24 +01:00
|
|
|
// Return the value of the "Location" header of `answer`.
//
// Throws std::runtime_error if that value is empty.
string Document::extract_location(const curl_wrapper::answer &answer)
{
    string target{answer.get_header("Location")};
    if (!target.empty())
    {
        return target;
    }

    throw std::runtime_error{"Could not extract new feed location."};
}
|
2019-12-28 09:48:42 +01:00
|
|
|
|
|
|
|
// Return a copy of `text` in which the first occurrence of every watchword is
// prefixed with '#'.
//
// A watchword only matches (case-insensitively) when it is preceded by
// whitespace, U+200B or the start of the text and followed by whitespace,
// U+200B, punctuation or the end of the text. The watchword is inserted into
// the regular expression as is.
string Document::add_hashtags(const string &text)
{
    string tagged{text};

    for (const auto &watchword : _watchwords)
    {
        string pattern{"([[:space:]\u200b]|^)("};
        pattern += watchword;
        pattern += ")([[:space:]\u200b[:punct:]]|$)";

        const regex re_watchword(pattern, regex::icase);
        // Only the first hit per watchword is turned into a hashtag.
        tagged = regex_replace(tagged, re_watchword, "$1#$2$3",
                               boost::format_first_only);
    }

    return tagged;
}
|
|
|
|
|
|
|
|
void Document::parse_watchwords()
|
2019-12-28 09:48:42 +01:00
|
|
|
{
|
|
|
|
Json::Value json;
|
2019-12-28 09:58:49 +01:00
|
|
|
const auto filepath = _cfg.get_config_dir() /= "watchwords.json";
|
2020-11-21 23:21:02 +01:00
|
|
|
ifstream file(filepath.c_str());
|
2019-12-28 09:48:42 +01:00
|
|
|
if (file.good())
|
|
|
|
{
|
|
|
|
stringstream rawjson;
|
|
|
|
rawjson << file.rdbuf();
|
|
|
|
rawjson >> json;
|
2019-12-28 09:58:49 +01:00
|
|
|
BOOST_LOG_TRIVIAL(debug) << "Read " << filepath;
|
2019-12-28 09:48:42 +01:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2020-06-19 23:04:42 +02:00
|
|
|
BOOST_LOG_TRIVIAL(warning)
|
|
|
|
<< "File Not found: "
|
|
|
|
<< (_cfg.get_config_dir() /= "watchwords.json").string();
|
2020-01-03 03:26:00 +01:00
|
|
|
return;
|
2019-12-28 09:48:42 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
const auto &tags_profile = json[_cfg.profile]["tags"];
|
|
|
|
const auto &tags_global = json["global"]["tags"];
|
|
|
|
transform(tags_profile.begin(), tags_profile.end(),
|
2019-12-29 00:05:13 +01:00
|
|
|
std::back_inserter(_watchwords),
|
2019-12-28 09:48:42 +01:00
|
|
|
[](const Json::Value &value) { return value.asString(); });
|
|
|
|
transform(tags_global.begin(), tags_global.end(),
|
2019-12-29 00:05:13 +01:00
|
|
|
std::back_inserter(_watchwords),
|
2019-12-28 09:48:42 +01:00
|
|
|
[](const Json::Value &value) { return value.asString(); });
|
|
|
|
}
|
|
|
|
|
2019-12-24 18:53:45 +01:00
|
|
|
} // namespace mastorss
|