mastorss/src/parse.cpp

211 lines
7.6 KiB
C++
Raw Normal View History

2018-02-18 14:27:01 +01:00
/* This file is part of mastorss.
2019-04-21 04:00:55 +02:00
* Copyright © 2018, 2019 tastytea <tastytea@tastytea.de>
2018-02-18 14:27:01 +01:00
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, version 3.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <iostream>
#include <vector>
#include <string>
#include <cstdint>
#include <regex>
#include <sstream>
2018-02-22 02:51:58 +01:00
#include <locale>
#include <codecvt>
2018-04-14 13:57:03 +02:00
#include <fstream>
#include <algorithm>
#include <iterator>
2018-04-14 13:57:03 +02:00
#include <jsoncpp/json/json.h>
2018-02-18 14:27:01 +01:00
#include <boost/property_tree/ptree.hpp>
#include <boost/property_tree/xml_parser.hpp>
#include <mastodon-cpp/mastodon-cpp.hpp>
#include <mastodon-cpp/easy/all.hpp>
2018-02-18 14:27:01 +01:00
#include "mastorss.hpp"
using std::cerr;
using std::string;
2018-04-14 13:57:03 +02:00
namespace pt = boost::property_tree;
2018-02-18 14:27:01 +01:00
std::vector<Mastodon::Easy::Status> parse_feed(const string &xml)
2018-02-18 14:27:01 +01:00
{
2018-04-14 13:57:03 +02:00
Json::Value list;
2018-02-18 14:27:01 +01:00
std::vector<string> watchwords;
2018-04-14 13:57:03 +02:00
std::ifstream file(filepath + "watchwords.json");
if (file.is_open())
2018-02-18 14:27:01 +01:00
{
2018-04-14 13:57:03 +02:00
std::stringstream json;
json << file.rdbuf();
file.close();
json >> list;
2018-02-18 14:27:01 +01:00
}
2018-04-14 13:57:03 +02:00
else
2018-02-18 14:27:01 +01:00
{
cerr << "WARNING: " << filepath << "watchwords.json not found or not readable.\n";
2018-02-18 14:27:01 +01:00
}
2018-04-14 13:57:03 +02:00
// Read profile-specific hashtags or fail silently
const Json::Value &tags_profile = list[profile]["tags"];
std::transform(tags_profile.begin(), tags_profile.end(),
std::back_inserter(watchwords),
[](const Json::Value &value)
{ return value.asString(); });
2018-04-14 13:57:03 +02:00
// Read global hashtags or fail silently
const Json::Value &tags_global = list["global"]["tags"];
std::transform(tags_global.begin(), tags_global.end(),
std::back_inserter(watchwords),
[](const Json::Value &value)
{ return value.asString(); });
2018-02-18 14:27:01 +01:00
pt::ptree rss;
std::istringstream iss(xml);
pt::read_xml(iss, rss);
std::vector<Mastodon::Easy::Status> ret;
2018-02-18 14:27:01 +01:00
for (const pt::ptree::value_type &chanchild : rss.get_child("rss.channel"))
2018-02-18 14:27:01 +01:00
{
if (chanchild.second.size() > 0)
2018-02-18 14:27:01 +01:00
{
if (string(chanchild.first.data()).compare("item") == 0)
2018-02-18 14:27:01 +01:00
{
string title = chanchild.second.get_child("title").data();
string link = chanchild.second.get_child("link").data();
string desc = chanchild.second.get_child("description").data();
2018-04-14 14:10:14 +02:00
Mastodon::Easy::Status status;
string content = "";
if (config[profile]["titles_as_cw"].asBool())
{
2019-04-21 04:00:55 +02:00
status.spoiler_text(Mastodon::unescape_html(title));
}
else
{
content = title;
}
2018-04-14 14:35:21 +02:00
if (!config[profile]["titles_only"].asBool())
2018-04-14 14:10:14 +02:00
{
if (!content.empty())
{
content += "\n\n";
}
content += desc;
2018-05-07 23:09:21 +02:00
// Shrink overly long texts, to speed up replace operations
if (content.length() > 2000)
2018-05-07 23:09:21 +02:00
{
content.resize(2000);
2018-05-07 23:09:21 +02:00
}
2018-04-14 14:10:14 +02:00
}
2018-02-18 14:27:01 +01:00
bool skipthis = false;
2018-03-15 13:20:26 +01:00
try
2018-02-20 23:29:55 +01:00
{
2018-03-15 13:20:26 +01:00
// Skip entries beginning with this text
2018-04-14 14:10:14 +02:00
for (const Json::Value &v : config[profile]["skip"])
2018-03-15 13:20:26 +01:00
{
2018-04-14 13:57:03 +02:00
const string skip = v.asString();
2018-03-15 13:20:26 +01:00
if (!skip.empty())
{
if (title.compare(0, skip.length(), skip) == 0)
{
skipthis = true;
break;
2018-03-15 13:20:26 +01:00
}
}
}
}
catch (const std::exception &e)
{
// Node not found, no problem
2018-02-20 23:29:55 +01:00
}
if (skipthis)
{
continue;
}
2018-02-20 23:29:55 +01:00
2019-04-21 04:00:55 +02:00
content = Mastodon::unescape_html(content);
2018-02-22 02:51:58 +01:00
2018-03-15 13:20:26 +01:00
// Try to turn the HTML into human-readable text
2018-04-15 12:06:24 +02:00
std::regex reparagraph("<p>");
2018-02-18 14:27:01 +01:00
std::regex recdata1("<!\\[CDATA\\[");
std::regex recdata2("\\]\\]>");
std::regex restrip("<[^>]*>");
2018-02-20 23:29:55 +01:00
individual_fixes(content);
2018-02-18 14:27:01 +01:00
content = std::regex_replace(content, reparagraph, "\n\n");
content = std::regex_replace(content, recdata1, "");
content = std::regex_replace(content, recdata2, "");
content = std::regex_replace(content, restrip, "");
// remove \r
content = std::regex_replace(content, std::regex("\\r"), "");
// replace NO-BREAK SPACE with space (UTF-8: 0xc2a0)
content = std::regex_replace(content, std::regex("\u00a0"), " ");
// remove whitespace between newlines
content = std::regex_replace(content, std::regex("\\n[ \t]+\\n"), "");
// remove excess newlines
content = std::regex_replace(content, std::regex("\\n{3,}"), "\n\n");
2018-02-18 14:27:01 +01:00
for (const string &hashtag : watchwords)
{
std::regex rehashtag("([[:space:][:punct:]]|^)(" + hashtag
+ ")([[:space:][:punct:]]|$)",
std::regex_constants::icase);
content = std::regex_replace(content, rehashtag, "$1#$2$3",
std::regex_constants::format_first_only);
2018-02-18 14:27:01 +01:00
}
2018-02-23 00:35:11 +01:00
// Why is this necessary? Why does ##hashtag happen?
content = std::regex_replace(content, std::regex("##"), "#");
uint16_t appendix_size = config[profile]["append"].asString().length();
if ((status.spoiler_text().size() + content.size() + link.size() + appendix_size)
> static_cast<std::uint16_t>(max_size - 4))
2018-02-18 14:27:01 +01:00
{
content.resize((max_size - status.spoiler_text().size()
- link.size() - appendix_size - 4));
content.resize(content.rfind(' ')); // Cut at word boundary
content += " […]";
2018-02-18 14:27:01 +01:00
}
// Remove trailing newlines
while (content.back() == '\n' ||
content.back() == '\r')
{
content.resize(content.length() - 1);
}
2018-05-11 01:17:53 +02:00
content += "\n\n" + link;
2018-05-11 01:17:53 +02:00
if (!config[profile]["append"].empty())
{
content += "\n\n" + config[profile]["append"].asString();
2018-05-11 01:17:53 +02:00
}
status.content(content);
ret.push_back(status);
2018-02-18 14:27:01 +01:00
}
}
}
return ret;
}
2018-03-11 15:40:25 +01:00
2018-03-15 13:20:26 +01:00
// Read regular expressions from the config file and delete all matches.
2018-03-11 15:40:25 +01:00
void individual_fixes(string &str)
{
2018-04-14 14:10:14 +02:00
for (const Json::Value &v : config[profile]["fixes"])
2018-03-15 13:20:26 +01:00
{
2018-04-14 13:57:03 +02:00
std::regex refix(v.asString());
str = std::regex_replace(str, refix, "");
2018-03-15 13:20:26 +01:00
}
2018-03-11 15:40:25 +01:00
}