mastorss/src/parse.cpp

/*  This file is part of mastorss.
 *  Copyright © 2018 tastytea <tastytea@tastytea.de>
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, version 3.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#include <iostream>
#include <vector>
#include <string>
#include <cstdint>
#include <regex>
#include <sstream>
#include <locale>
#include <codecvt>
#include <fstream>
#include <jsoncpp/json/json.h>
#include <boost/property_tree/ptree.hpp>
#include <boost/property_tree/xml_parser.hpp>
#include <mastodon-cpp/mastodon-cpp.hpp>
#include "mastorss.hpp"

using std::cerr;
using std::string;
namespace pt = boost::property_tree;

// Decode HTML character references in str, in place.
//
// Handles decimal (&#228;) and hexadecimal (&#xe4; / &#XE4;) numeric
// references, which are written out as UTF-8, and the named entities
// &lt; &gt; &amp; &quot; &apos;.
//
// Everything is decoded in one left-to-right pass, so the result of one
// replacement is never decoded a second time: "&amp;quot;" becomes "&quot;",
// not "\"". Numeric references that do not name a Unicode scalar value
// (0, surrogates, anything above U+10FFFF) are copied through unchanged.
void unescape_html(std::string &str)
{
    // Group 1: decimal digits, group 2: hexadecimal digits, group 3: name.
    static const std::regex re_entity(
        "&(?:#([0-9]{1,8})|#[xX]([0-9a-fA-F]{1,8})|(lt|gt|amp|quot|apos));");

    // Appends the UTF-8 encoding of codepoint to out. Returns false and
    // appends nothing if codepoint is not a Unicode scalar value.
    const auto append_utf8 = [](std::string &out, const char32_t codepoint) -> bool
    {
        const bool is_surrogate = (codepoint >= 0xD800 && codepoint <= 0xDFFF);
        if (codepoint == 0 || codepoint > 0x10FFFF || is_surrogate)
        {
            return false;
        }

        if (codepoint < 0x80)
        {
            out += static_cast<char>(codepoint);
        }
        else if (codepoint < 0x800)
        {
            out += static_cast<char>(0xC0 | (codepoint >> 6));
            out += static_cast<char>(0x80 | (codepoint & 0x3F));
        }
        else if (codepoint < 0x10000)
        {
            out += static_cast<char>(0xE0 | (codepoint >> 12));
            out += static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F));
            out += static_cast<char>(0x80 | (codepoint & 0x3F));
        }
        else
        {
            out += static_cast<char>(0xF0 | (codepoint >> 18));
            out += static_cast<char>(0x80 | ((codepoint >> 12) & 0x3F));
            out += static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F));
            out += static_cast<char>(0x80 | (codepoint & 0x3F));
        }
        return true;
    };

    std::string result;
    result.reserve(str.size());

    std::smatch match;
    std::string::const_iterator pos = str.cbegin();
    const std::string::const_iterator end = str.cend();

    while (std::regex_search(pos, end, match, re_entity))
    {
        // Copy the text in front of the reference verbatim.
        result.append(pos, match[0].first);

        bool decoded = true;
        if (match[1].matched)
        {
            // At most 8 decimal digits, always fits into unsigned long.
            const unsigned long value = std::stoul(match[1].str(), nullptr, 10);
            decoded = append_utf8(result, static_cast<char32_t>(value));
        }
        else if (match[2].matched)
        {
            // At most 8 hex digits (32 bit), always fits into unsigned long.
            const unsigned long value = std::stoul(match[2].str(), nullptr, 16);
            decoded = append_utf8(result, static_cast<char32_t>(value));
        }
        else
        {
            const std::string name = match[3].str();
            if (name == "lt")
            {
                result += '<';
            }
            else if (name == "gt")
            {
                result += '>';
            }
            else if (name == "amp")
            {
                result += '&';
            }
            else if (name == "quot")
            {
                result += '"';
            }
            else // apos
            {
                result += '\'';
            }
        }

        if (!decoded)
        {
            // Not a valid code point: keep the reference as it was.
            result.append(match[0].first, match[0].second);
        }
        pos = match[0].second;
    }
    result.append(pos, end);

    str.swap(result);
}

std::vector<string> parse_website(const string &xml)
{
    Json::Value list;
    std::vector<string> watchwords;

    std::ifstream file(filepath + "watchwords.json");
    if (file.is_open())
    {
        std::stringstream json;
        json << file.rdbuf();
        file.close();
        json >> list;
    }
    else
    {
        cerr << "ERROR: " << filepath << "watchwords.json not found or not readable.\n";
        return {};
    }

    // Read profile-specific hashtags or fail silently
    for (const Json::Value &value : list[profile]["tags"])
    {
        watchwords.push_back(value.asString());
    }

    // Read global hashtags or fail silently
    for (const Json::Value &value : list["global"]["tags"])
    {
        watchwords.push_back(value.asString());
    }

    pt::ptree rss;
    std::istringstream iss(xml);
    pt::read_xml(iss, rss);
    std::vector<string> ret;

    for (const pt::ptree::value_type &v : rss.get_child("rss.channel"))
    {
        if (v.second.size() > 0)
        {
            if (string(v.first.data()).compare("item") == 0)
            {
                string title = v.second.get_child("title").data();
                string link = v.second.get_child("link").data();
                string desc = v.second.get_child("description").data();

                string str = title;
                if (!config[profile]["titles_only"].asBool())
                {
                    str += "\n\n" + desc;
                }

                bool skipthis = false;
                try
                {
                    // Skip entries beginning with this text
                    for (const Json::Value &v : config[profile]["skip"])
                    {
                        const string skip = v.asString();
                        if (!skip.empty())
                        {
                            if (title.compare(0, skip.length(), skip) == 0)
                            {
                                skipthis = true;
                                break;
                            }
                        }
                    }
                }
                catch (const std::exception &e)
                {
                    // Node not found, no problem
                }
                if (skipthis)
                {
                    continue;
                }

                unescape_html(str);

                // Try to turn the HTML into human-readable text
                std::regex reparagraph("<p>");
                std::regex recdata1("<!\\[CDATA\\[");
                std::regex recdata2("\\]\\]>");
                std::regex restrip("<[^>]*>");

                individual_fixes(str);

                str = std::regex_replace(str, reparagraph, "\n\n");
                str = std::regex_replace(str, recdata1, "");
                str = std::regex_replace(str, recdata2, "");
                str = std::regex_replace(str, restrip, "");
                str = std::regex_replace(str, std::regex("\\r"), "");           // remove \r
                str = std::regex_replace(str, std::regex("\uc2a0"), " ");       // replace NO-BREAK SPACE with space
                str = std::regex_replace(str, std::regex("\\n[ \t]+\\n"), "");  // remove whitespace between newlines
                str = std::regex_replace(str, std::regex("\\n{3,}"), "\n\n");   // remove excess newlines

                for (const string &hashtag : watchwords)
                {
                    std::regex rehashtag("([[:space:][:punct:]]|^)(" + hashtag + ")([[:space:][:punct:]]|$)",
                                         std::regex_constants::icase);
                    str = std::regex_replace(str, rehashtag, "$1#$2$3",
                                             std::regex_constants::format_first_only);
                }
                // Why is this necessary? Why does ##hashtag happen?
                str = std::regex_replace(str, std::regex("##"), "#");
                if ((str.size() + link.size()) > static_cast<std::uint16_t>(max_size - 15))
                {
                    str.resize((max_size - link.size() - 15));
                    str.resize(str.rfind(' ')); // Cut at word boundary
                    str += " […]";
                }
                // Remove trailing newlines
                while (str.back() == '\n' ||
                       str.back() == '\r')
                {
                    str.resize(str.length() - 1);
                }
                str += "\n\n" + link + "\n\n#bot";
                ret.push_back(str);
            }
        }
    }

    return ret;
}

// Apply the profile's "fixes" to str: every entry of config[profile]["fixes"]
// is compiled as a regular expression and all of its matches are deleted
// from str, one pattern after the other, in the order they are listed.
void individual_fixes(string &str)
{
    auto &&patterns = config[profile]["fixes"];
    for (auto it = patterns.begin(); it != patterns.end(); ++it)
    {
        const std::regex pattern((*it).asString());
        const string cleaned = std::regex_replace(str, pattern, "");
        str = cleaned;
    }
}