mastorss/src/parse.cpp

/*  This file is part of mastorss.
 *  Copyright © 2018, 2019 tastytea <tastytea@tastytea.de>
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, version 3.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#include <iostream>
#include <vector>
#include <string>
#include <cstdint>
#include <regex>
#include <sstream>
#include <locale>
#include <codecvt>
#include <fstream>
#include <algorithm>
#include <iterator>
#include <jsoncpp/json/json.h>
#include <boost/property_tree/ptree.hpp>
#include <boost/property_tree/xml_parser.hpp>
#include <mastodon-cpp/mastodon-cpp.hpp>
#include <mastodon-cpp/easy/all.hpp>
#include "mastorss.hpp"

using std::cerr;
using std::string;
namespace pt = boost::property_tree;

std::vector<Mastodon::Easy::Status> parse_feed(const string &xml)
{
    Json::Value list;
    std::vector<string> watchwords;

    std::ifstream file(filepath + "watchwords.json");
    if (file.is_open())
    {
        std::stringstream json;
        json << file.rdbuf();
        file.close();
        json >> list;
    }
    else
    {
        cerr << "WARNING: " << filepath << "watchwords.json not found or not readable.\n";
    }

    // Read profile-specific hashtags or fail silently
    const Json::Value &tags_profile = list[profile]["tags"];
    std::transform(tags_profile.begin(), tags_profile.end(),
                   std::back_inserter(watchwords),
                   [](const Json::Value &value)
                   { return value.asString(); });

    // Read global hashtags or fail silently
    const Json::Value &tags_global = list["global"]["tags"];
    std::transform(tags_global.begin(), tags_global.end(),
                   std::back_inserter(watchwords),
                   [](const Json::Value &value)
                   { return value.asString(); });

    pt::ptree rss;
    std::istringstream iss(xml);
    pt::read_xml(iss, rss);
    std::vector<Mastodon::Easy::Status> ret;

    for (const pt::ptree::value_type &chanchild : rss.get_child("rss.channel"))
    {
        if (chanchild.second.size() > 0)
        {
            if (string(chanchild.first.data()).compare("item") == 0)
            {
                string title = chanchild.second.get_child("title").data();
                string link = chanchild.second.get_child("link").data();
                string desc = chanchild.second.get_child("description").data();

                Mastodon::Easy::Status status;
                string content = "";
                if (config[profile]["titles_as_cw"].asBool())
                {
                    status.spoiler_text(Mastodon::unescape_html(title));
                }
                else
                {
                    content = title;
                }
                if (!config[profile]["titles_only"].asBool())
                {
                    if (!content.empty())
                    {
                        content += "\n\n";
                    }
                    content += desc;

                    // Shrink overly long texts, to speed up replace operations
                    if (content.length() > 2000)
                    {
                        content.resize(2000);
                    }
                }

                bool skipthis = false;
                try
                {
                    // Skip entries beginning with this text
                    for (const Json::Value &v : config[profile]["skip"])
                    {
                        const string skip = v.asString();
                        if (!skip.empty())
                        {
                            if (title.compare(0, skip.length(), skip) == 0)
                            {
                                skipthis = true;
                                break;
                            }
                        }
                    }
                }
                catch (const std::exception &e)
                {
                    // Node not found, no problem
                }
                if (skipthis)
                {
                    continue;
                }

                content = Mastodon::unescape_html(content);

                // Try to turn the HTML into human-readable text
                std::regex reparagraph("<p>");
                std::regex recdata1("<!\\[CDATA\\[");
                std::regex recdata2("\\]\\]>");
                std::regex restrip("<[^>]*>");

                individual_fixes(content);

                content = std::regex_replace(content, reparagraph, "\n\n");
                content = std::regex_replace(content, recdata1, "");
                content = std::regex_replace(content, recdata2, "");
                content = std::regex_replace(content, restrip, "");
                // remove \r
                content = std::regex_replace(content, std::regex("\\r"), "");
                // replace NO-BREAK SPACE with space (UTF-8: 0xc2a0)
                content = std::regex_replace(content, std::regex("\u00a0"), " ");
                // remove whitespace between newlines
                content = std::regex_replace(content, std::regex("\\n[ \t]+\\n"), "");
                // remove excess newlines
                content = std::regex_replace(content, std::regex("\\n{3,}"), "\n\n");

                for (const string &hashtag : watchwords)
                {
                    std::regex rehashtag("([[:space:][:punct:]]|^)(" + hashtag
                                         + ")([[:space:][:punct:]]|$)",
                                         std::regex_constants::icase);
                    content = std::regex_replace(content, rehashtag, "$1#$2$3",
                                                 std::regex_constants::format_first_only);
                }
                // Why is this necessary? Why does ##hashtag happen?
                content = std::regex_replace(content, std::regex("##"), "#");

                uint16_t appendix_size = config[profile]["append"].asString().length();
                if ((status.spoiler_text().size() + content.size() + link.size() + appendix_size)
                    > static_cast<std::uint16_t>(max_size - 4))
                {
                    content.resize((max_size - status.spoiler_text().size()
                                    - link.size() - appendix_size - 4));
                    content.resize(content.rfind(' ')); // Cut at word boundary
                    content += " […]";
                }
                // Remove trailing newlines
                while (content.back() == '\n' ||
                       content.back() == '\r')
                {
                    content.resize(content.length() - 1);
                }

                content += "\n\n" + link;

                if (!config[profile]["append"].empty())
                {
                    content += "\n\n" + config[profile]["append"].asString();
                }
                status.content(content);
                ret.push_back(status);
            }
        }
    }

    return ret;
}

// Read regular expressions from the config file and delete all matches.
// Apply the per-profile clean-up rules to str.
//
// Each entry of the current profile's "fixes" list in the config is compiled
// as a regular expression; every match is deleted from str. The entries are
// applied one after another, in the order they are listed, so a later
// expression works on the output of the earlier ones.
void individual_fixes(string &str)
{
    const Json::Value &fixes = config[profile]["fixes"];

    for (auto fix = fixes.begin(); fix != fixes.end(); ++fix)
    {
        const std::regex pattern((*fix).asString());
        const string cleaned = std::regex_replace(str, pattern, "");
        str = cleaned;
    }
}
refactoring 2018-02-18 14:27:01 +01:00			`/* This file is part of mastorss.`
Ported to mastoson-cpp 0.105.0. 2019-04-21 04:00:55 +02:00			`* Copyright © 2018, 2019 tastytea <tastytea@tastytea.de>`
refactoring 2018-02-18 14:27:01 +01:00			`*`
			`* This program is free software: you can redistribute it and/or modify`
			`* it under the terms of the GNU General Public License as published by`
			`* the Free Software Foundation, version 3.`
			`*`
			`* This program is distributed in the hope that it will be useful,`
			`* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`* GNU General Public License for more details.`
			`*`
			`* You should have received a copy of the GNU General Public License`
			`* along with this program. If not, see <http://www.gnu.org/licenses/>.`
			`*/`

			`#include <iostream>`
			`#include <vector>`
			`#include <string>`
			`#include <cstdint>`
			`#include <regex>`
			`#include <sstream>`
Unescaping some more HTML entities 2018-02-22 02:51:58 +01:00			`#include <locale>`
			`#include <codecvt>`
replaced boost json with jsoncpp 2018-04-14 13:57:03 +02:00			`#include <fstream>`
Replaced for-loops with std::transform, changed a variable name. 2019-04-21 04:14:02 +02:00			`#include <algorithm>`
			`#include <iterator>`
replaced boost json with jsoncpp 2018-04-14 13:57:03 +02:00			`#include <jsoncpp/json/json.h>`
refactoring 2018-02-18 14:27:01 +01:00			`#include <boost/property_tree/ptree.hpp>`
			`#include <boost/property_tree/xml_parser.hpp>`
updated header location for mastodon-cpp 2018-04-09 18:16:17 +02:00			`#include <mastodon-cpp/mastodon-cpp.hpp>`
rewrote parse_website() to return vector of Mastodon::Easy::Status 2018-08-25 14:23:58 +02:00			`#include <mastodon-cpp/easy/all.hpp>`
refactoring 2018-02-18 14:27:01 +01:00			`#include "mastorss.hpp"`

			`using std::cerr;`
			`using std::string;`
replaced boost json with jsoncpp 2018-04-14 13:57:03 +02:00			`namespace pt = boost::property_tree;`
refactoring 2018-02-18 14:27:01 +01:00
Added configurable interval between posts to prevent flooding and to allow compliance with newsbots.eu. 2018-09-20 05:06:10 +02:00			`std::vector<Mastodon::Easy::Status> parse_feed(const string &xml)`
refactoring 2018-02-18 14:27:01 +01:00			`{`
replaced boost json with jsoncpp 2018-04-14 13:57:03 +02:00			`Json::Value list;`
refactoring 2018-02-18 14:27:01 +01:00			`std::vector<string> watchwords;`

replaced boost json with jsoncpp 2018-04-14 13:57:03 +02:00			`std::ifstream file(filepath + "watchwords.json");`
			`if (file.is_open())`
refactoring 2018-02-18 14:27:01 +01:00			`{`
replaced boost json with jsoncpp 2018-04-14 13:57:03 +02:00			`std::stringstream json;`
			`json << file.rdbuf();`
			`file.close();`
			`json >> list;`
refactoring 2018-02-18 14:27:01 +01:00			`}`
replaced boost json with jsoncpp 2018-04-14 13:57:03 +02:00			`else`
refactoring 2018-02-18 14:27:01 +01:00			`{`
watchwords.json is no longer required to exist. Fixes #2 2018-08-26 19:46:07 +02:00			`cerr << "WARNING: " << filepath << "watchwords.json not found or not readable.\n";`
refactoring 2018-02-18 14:27:01 +01:00			`}`

replaced boost json with jsoncpp 2018-04-14 13:57:03 +02:00			`// Read profile-specific hashtags or fail silently`
Replaced for-loops with std::transform, changed a variable name. 2019-04-21 04:14:02 +02:00			`const Json::Value &tags_profile = list[profile]["tags"];`
			`std::transform(tags_profile.begin(), tags_profile.end(),`
			`std::back_inserter(watchwords),`
			`[](const Json::Value &value)`
			`{ return value.asString(); });`
replaced boost json with jsoncpp 2018-04-14 13:57:03 +02:00
			`// Read global hashtags or fail silently`
Replaced for-loops with std::transform, changed a variable name. 2019-04-21 04:14:02 +02:00			`const Json::Value &tags_global = list["global"]["tags"];`
			`std::transform(tags_global.begin(), tags_global.end(),`
			`std::back_inserter(watchwords),`
			`[](const Json::Value &value)`
			`{ return value.asString(); });`
refactoring 2018-02-18 14:27:01 +01:00
			`pt::ptree rss;`
			`std::istringstream iss(xml);`
			`pt::read_xml(iss, rss);`
rewrote parse_website() to return vector of Mastodon::Easy::Status 2018-08-25 14:23:58 +02:00			`std::vector<Mastodon::Easy::Status> ret;`
refactoring 2018-02-18 14:27:01 +01:00
Replaced for-loops with std::transform, changed a variable name. 2019-04-21 04:14:02 +02:00			`for (const pt::ptree::value_type &chanchild : rss.get_child("rss.channel"))`
refactoring 2018-02-18 14:27:01 +01:00			`{`
Replaced for-loops with std::transform, changed a variable name. 2019-04-21 04:14:02 +02:00			`if (chanchild.second.size() > 0)`
refactoring 2018-02-18 14:27:01 +01:00			`{`
Replaced for-loops with std::transform, changed a variable name. 2019-04-21 04:14:02 +02:00			`if (string(chanchild.first.data()).compare("item") == 0)`
refactoring 2018-02-18 14:27:01 +01:00			`{`
Replaced for-loops with std::transform, changed a variable name. 2019-04-21 04:14:02 +02:00			`string title = chanchild.second.get_child("title").data();`
			`string link = chanchild.second.get_child("link").data();`
			`string desc = chanchild.second.get_child("description").data();`
Added option to only post titles 2018-04-14 14:10:14 +02:00
rewrote parse_website() to return vector of Mastodon::Easy::Status 2018-08-25 14:23:58 +02:00			`Mastodon::Easy::Status status;`
			`string content = "";`
			`if (config[profile]["titles_as_cw"].asBool())`
			`{`
Ported to mastoson-cpp 0.105.0. 2019-04-21 04:00:55 +02:00			`status.spoiler_text(Mastodon::unescape_html(title));`
rewrote parse_website() to return vector of Mastodon::Easy::Status 2018-08-25 14:23:58 +02:00			`}`
			`else`
			`{`
			`content = title;`
			`}`
fixed bug with titles_only 2018-04-14 14:35:21 +02:00			`if (!config[profile]["titles_only"].asBool())`
Added option to only post titles 2018-04-14 14:10:14 +02:00			`{`
rewrote parse_website() to return vector of Mastodon::Easy::Status 2018-08-25 14:23:58 +02:00			`if (!content.empty())`
			`{`
			`content += "\n\n";`
			`}`
			`content += desc;`
introduced text limit of 2000 chars 2018-05-07 23:09:21 +02:00
			`// Shrink overly long texts, to speed up replace operations`
rewrote parse_website() to return vector of Mastodon::Easy::Status 2018-08-25 14:23:58 +02:00			`if (content.length() > 2000)`
introduced text limit of 2000 chars 2018-05-07 23:09:21 +02:00			`{`
rewrote parse_website() to return vector of Mastodon::Easy::Status 2018-08-25 14:23:58 +02:00			`content.resize(2000);`
introduced text limit of 2000 chars 2018-05-07 23:09:21 +02:00			`}`
Added option to only post titles 2018-04-14 14:10:14 +02:00			`}`
refactoring 2018-02-18 14:27:01 +01:00
fixed faulty article skipping (bug introduced in 0.4.0) 2018-03-15 13:54:53 +01:00			`bool skipthis = false;`
refactoring 2018-03-15 13:20:26 +01:00			`try`
Added special case: ANF News 2018-02-20 23:29:55 +01:00			`{`
refactoring 2018-03-15 13:20:26 +01:00			`// Skip entries beginning with this text`
Added option to only post titles 2018-04-14 14:10:14 +02:00			`for (const Json::Value &v : config[profile]["skip"])`
refactoring 2018-03-15 13:20:26 +01:00			`{`
replaced boost json with jsoncpp 2018-04-14 13:57:03 +02:00			`const string skip = v.asString();`
refactoring 2018-03-15 13:20:26 +01:00			`if (!skip.empty())`
			`{`
			`if (title.compare(0, skip.length(), skip) == 0)`
			`{`
fixed faulty article skipping (bug introduced in 0.4.0) 2018-03-15 13:54:53 +01:00			`skipthis = true;`
			`break;`
refactoring 2018-03-15 13:20:26 +01:00			`}`
			`}`
			`}`
			`}`
			`catch (const std::exception &e)`
			`{`
			`// Node not found, no problem`
Added special case: ANF News 2018-02-20 23:29:55 +01:00			`}`
fixed faulty article skipping (bug introduced in 0.4.0) 2018-03-15 13:54:53 +01:00			`if (skipthis)`
			`{`
			`continue;`
			`}`
Added special case: ANF News 2018-02-20 23:29:55 +01:00
Ported to mastoson-cpp 0.105.0. 2019-04-21 04:00:55 +02:00			`content = Mastodon::unescape_html(content);`
Unescaping some more HTML entities 2018-02-22 02:51:58 +01:00
refactoring 2018-03-15 13:20:26 +01:00			`// Try to turn the HTML into human-readable text`
turn <p> into \n\n 2018-04-15 12:06:24 +02:00			`std::regex reparagraph("<p>");`
refactoring 2018-02-18 14:27:01 +01:00			`std::regex recdata1("<!\\[CDATA\\[");`
			`std::regex recdata2("\\]\\]>");`
			`std::regex restrip("<[^>]*>");`
Added special case: ANF News 2018-02-20 23:29:55 +01:00
rewrote parse_website() to return vector of Mastodon::Easy::Status 2018-08-25 14:23:58 +02:00			`individual_fixes(content);`
refactoring 2018-02-18 14:27:01 +01:00
rewrote parse_website() to return vector of Mastodon::Easy::Status 2018-08-25 14:23:58 +02:00			`content = std::regex_replace(content, reparagraph, "\n\n");`
			`content = std::regex_replace(content, recdata1, "");`
			`content = std::regex_replace(content, recdata2, "");`
			`content = std::regex_replace(content, restrip, "");`
			`// remove \r`
			`content = std::regex_replace(content, std::regex("\\r"), "");`
bugfix: NO-BREAK SPACE: confused unicode representation with UTF-8 hex representation 2018-05-02 20:17:55 +02:00			`// replace NO-BREAK SPACE with space (UTF-8: 0xc2a0)`
rewrote parse_website() to return vector of Mastodon::Easy::Status 2018-08-25 14:23:58 +02:00			`content = std::regex_replace(content, std::regex("\u00a0"), " ");`
			`// remove whitespace between newlines`
			`content = std::regex_replace(content, std::regex("\\n[ \t]+\\n"), "");`
			`// remove excess newlines`
			`content = std::regex_replace(content, std::regex("\\n{3,}"), "\n\n");`
refactoring 2018-02-18 14:27:01 +01:00
			`for (const string &hashtag : watchwords)`
			`{`
This REALLY fixes bug #1 (I checked the size, but not properly resized the content 🤦) 2018-08-30 01:24:19 +02:00			`std::regex rehashtag("([[:space:][:punct:]]\|^)(" + hashtag`
			`+ ")([[:space:][:punct:]]\|$)",`
			`std::regex_constants::icase);`
rewrote parse_website() to return vector of Mastodon::Easy::Status 2018-08-25 14:23:58 +02:00			`content = std::regex_replace(content, rehashtag, "$1#$2$3",`
			`std::regex_constants::format_first_only);`
refactoring 2018-02-18 14:27:01 +01:00			`}`
Fix ##hashtag (ugly) 2018-02-23 00:35:11 +01:00			`// Why is this necessary? Why does ##hashtag happen?`
rewrote parse_website() to return vector of Mastodon::Easy::Status 2018-08-25 14:23:58 +02:00			`content = std::regex_replace(content, std::regex("##"), "#");`
bugfix: Too long posts would sometimes not be shrinked 2018-08-25 23:52:22 +02:00
			`uint16_t appendix_size = config[profile]["append"].asString().length();`
Bugfix: Did not count characters in content warnings, resulted in lost messages. fixes #1 2018-08-30 01:07:19 +02:00			`if ((status.spoiler_text().size() + content.size() + link.size() + appendix_size)`
bugfix: Too long posts would sometimes not be shrinked 2018-08-25 23:52:22 +02:00			`> static_cast<std::uint16_t>(max_size - 4))`
refactoring 2018-02-18 14:27:01 +01:00			`{`
This REALLY fixes bug #1 (I checked the size, but not properly resized the content 🤦) 2018-08-30 01:24:19 +02:00			`content.resize((max_size - status.spoiler_text().size()`
			`- link.size() - appendix_size - 4));`
rewrote parse_website() to return vector of Mastodon::Easy::Status 2018-08-25 14:23:58 +02:00			`content.resize(content.rfind(' ')); // Cut at word boundary`
			`content += " […]";`
refactoring 2018-02-18 14:27:01 +01:00			`}`
remove trailing newlines in description 2018-04-16 21:05:46 +02:00			`// Remove trailing newlines`
rewrote parse_website() to return vector of Mastodon::Easy::Status 2018-08-25 14:23:58 +02:00			`while (content.back() == '\n' \|\|`
			`content.back() == '\r')`
remove trailing newlines in description 2018-04-16 21:05:46 +02:00			`{`
rewrote parse_website() to return vector of Mastodon::Easy::Status 2018-08-25 14:23:58 +02:00			`content.resize(content.length() - 1);`
remove trailing newlines in description 2018-04-16 21:05:46 +02:00			`}`
Made the appended string configurable 2018-05-11 01:17:53 +02:00
rewrote parse_website() to return vector of Mastodon::Easy::Status 2018-08-25 14:23:58 +02:00			`content += "\n\n" + link;`
Made the appended string configurable 2018-05-11 01:17:53 +02:00
			`if (!config[profile]["append"].empty())`
			`{`
rewrote parse_website() to return vector of Mastodon::Easy::Status 2018-08-25 14:23:58 +02:00			`content += "\n\n" + config[profile]["append"].asString();`
Made the appended string configurable 2018-05-11 01:17:53 +02:00			`}`
rewrote parse_website() to return vector of Mastodon::Easy::Status 2018-08-25 14:23:58 +02:00			`status.content(content);`
			`ret.push_back(status);`
refactoring 2018-02-18 14:27:01 +01:00			`}`
			`}`
			`}`

			`return ret;`
			`}`
started to refactor a bit 2018-03-11 15:40:25 +01:00
refactoring 2018-03-15 13:20:26 +01:00			`// Read regular expressions from the config file and delete all matches.`
started to refactor a bit 2018-03-11 15:40:25 +01:00			`void individual_fixes(string &str)`
			`{`
Added option to only post titles 2018-04-14 14:10:14 +02:00			`for (const Json::Value &v : config[profile]["fixes"])`
refactoring 2018-03-15 13:20:26 +01:00			`{`
replaced boost json with jsoncpp 2018-04-14 13:57:03 +02:00			`std::regex refix(v.asString());`
			`str = std::regex_replace(str, refix, "");`
refactoring 2018-03-15 13:20:26 +01:00			`}`
started to refactor a bit 2018-03-11 15:40:25 +01:00			`}`