mastorss/src/document.cpp

/*  This file is part of mastorss.
 *  Copyright © 2019-2021 tastytea <tastytea@tastytea.de>
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, version 3.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#include "document.hpp"

#include "curl_wrapper.hpp"
#include "exceptions.hpp"
#include "version.hpp"

#include <boost/log/trivial.hpp>
#include <boost/property_tree/xml_parser.hpp>
#include <boost/regex.hpp>
#include <json/json.h>
#include <mastodonpp/mastodonpp.hpp>

#include <algorithm>
#include <fstream>
#include <sstream>
#include <stdexcept>
#include <string>
#include <utility>

namespace mastorss
{
using boost::regex;
using boost::regex_replace;
using std::any_of;
using std::ifstream;
using std::istringstream;
using std::move;
using std::string;
using std::stringstream;
using std::transform;

// Inequality for feed items: two items differ exactly when their GUIDs
// differ. No other member of Item takes part in the comparison.
bool operator!=(const Item &a, const Item &b)
{
    const bool same_guid{a.guid == b.guid};
    return !same_guid;
}

// Stores the configuration and its profile data, then immediately fetches the
// feed via download(). Anything download() throws escapes the constructor.
Document::Document(Config &cfg)
    : _cfg(cfg)
    , _profiledata(_cfg.profiledata)
{
    download();
}

void Document::download(const string &uri, const bool temp_redirect)
{
    namespace cw = curl_wrapper;

    BOOST_LOG_TRIVIAL(debug) << "Downloading <" << uri << "> …";
    cw::CURLWrapper curl;
    curl.set_useragent(string("mastorss/") += version);
    curl.set_maxredirs(0);

    const auto answer{curl.make_http_request(cw::http_method::GET, uri)};

    BOOST_LOG_TRIVIAL(debug) << "Got response: " << answer.status;
    BOOST_LOG_TRIVIAL(debug) << "Got Headers:";
    BOOST_LOG_TRIVIAL(debug) << answer.headers;

    switch (answer.status)
    {
    case 200:
    {
        _raw_doc = answer.body;
        BOOST_LOG_TRIVIAL(debug) << "Downloaded feed: " << _profiledata.feedurl;
        break;
    }
    case 301:
    case 308:
    {
        if (temp_redirect)
        {
            goto temporary_redirect; // NOLINT(cppcoreguidelines-avoid-goto)
        }
        _profiledata.feedurl = extract_location(answer);
        if (_profiledata.feedurl.empty())
        {
            throw HTTPException{answer.status};
        }

        // clang-format off
        BOOST_LOG_TRIVIAL(debug) << "Feed has new location (permanent): "
                                 << _profiledata.feedurl;
        // clang-format on
        _cfg.write();
        download();
        break;
    }
    case 302:
    case 303:
    case 307:
    {
temporary_redirect:
        const string newuri{extract_location(answer)};
        if (newuri.empty())
        {
            throw HTTPException{answer.status};
        }

        // clang-format off
        BOOST_LOG_TRIVIAL(debug) << "Feed has new location (temporary): "
                                 << newuri;
        // clang-format on
        download(newuri, true);
        break;
    }
    default:
    {
        throw HTTPException{answer.status};
    }
    }
}

// Convenience overload: downloads the feed from the URL stored in the
// profile data.
void Document::download()
{
    const auto &feed_uri = _profiledata.feedurl;
    download(feed_uri);
}

// Parses the previously downloaded document (_raw_doc) as XML and hands it to
// the matching feed parser. Only RSS is recognised.
//
// If hashtags are enabled in the profile, the watchwords are loaded first.
//
// Throws ParseException if the document has no top-level <rss> element.
// Exceptions from pt::read_xml() and parse_rss() are passed through.
void Document::parse()
{
    if (_profiledata.add_hashtags)
    {
        parse_watchwords();
    }
    pt::ptree tree;
    istringstream iss{_raw_doc};
    pt::read_xml(iss, tree);

    // Look for an <rss> child instead of inspecting tree.front(): the XML
    // reader keeps comments as extra top-level nodes, so the root element is
    // not necessarily the first child, and front() must not be called on an
    // empty tree.
    if (tree.count("rss") == 0)
    {
        throw ParseException{"Could not detect type of feed."};
    }

    BOOST_LOG_TRIVIAL(debug) << "RSS detected.";
    parse_rss(tree);
}

// Extracts new <item>s from an RSS tree and prepends them to new_items, so
// that the item appearing first in the feed ends up last.
//
// tree: Parsed document; must contain the path "rss.channel", otherwise
//       get_child() throws.
//
// Parsing stops when
//   - Config::max_guids items have been looked at,
//   - an already posted GUID is found and keep_looking is off, or
//   - one item was added, no GUIDs are known yet and keep_looking is off
//     (first run).
// Items whose title starts with one of the profile's skip strings are
// ignored.
//
// <title>, <description> and <link> are optional in RSS 2.0; missing ones are
// treated as empty. An item with neither <guid> nor <link> still throws,
// because it can not be identified.
void Document::parse_rss(const pt::ptree &tree)
{
    size_t counter{0};
    for (const auto &child : tree.get_child("rss.channel"))
    {
        // Channel metadata (<title>, <link>, …) must not count as items.
        if (child.first != "item")
        {
            continue;
        }

        if (counter == Config::max_guids)
        {
            BOOST_LOG_TRIVIAL(debug)
                << "Maximum number of items reached. Stopped parsing.";
            break;
        }
        ++counter;

        const auto &rssitem = child.second;

        string guid{rssitem.get<string>("guid", "")};
        if (guid.empty()) // We hope either <guid> or <link> are present.
        {
            guid = rssitem.get<string>("link");
        }
        if (any_of(_profiledata.guids.begin(), _profiledata.guids.end(),
                   [&](const auto &old_guid) { return guid == old_guid; }))
        {
            // clang-format off
            BOOST_LOG_TRIVIAL(debug) << "Found already posted GUID: "
                                     << guid;
            // clang-format on
            if (_profiledata.keep_looking)
            {
                continue;
            }

            BOOST_LOG_TRIVIAL(debug) << "Stopped parsing.";
            break;
        }

        const string title{rssitem.get<string>("title", "")};
        if (any_of(_profiledata.skip.begin(), _profiledata.skip.end(),
                   [&title](const string &skip)
                   // clang-format off
                   { return title.substr(0, skip.size()) == skip; }))
        // clang-format on
        {
            BOOST_LOG_TRIVIAL(debug) << "Skipped GUID: " << guid;
            continue;
        }

        Item item;
        item.description = [&]
        // clang-format off
        {
            string desc{rssitem.get<string>("description", "")};
            // User-supplied fixes are applied before the HTML is removed.
            for (const auto &fix : _profiledata.fixes)
            {
                desc = regex_replace(desc, regex{fix}, "");
            }
            desc = remove_html(desc);
            if (_profiledata.add_hashtags)
            {
                desc = add_hashtags(desc);
            }
            return desc;
        }();
        // clang-format on
        item.guid = move(guid);
        item.link = rssitem.get<string>("link", "");
        item.title = mastodonpp::unescape_html(title);

        BOOST_LOG_TRIVIAL(debug) << "Found GUID: " << item.guid;

        // Item is not used after this point, so it can be moved.
        new_items.push_front(move(item));

        if (_profiledata.guids.empty() && !_profiledata.keep_looking)
        {
            BOOST_LOG_TRIVIAL(debug) << "This is the first run.";
            break;
        }
    }
}

// Converts an HTML fragment to plain text.
//
// html: The markup to convert.
//
// Steps, in order: decode HTML entities, turn paragraph and line-break tags
// into newlines, drop CDATA markers, all remaining tags and carriage returns,
// tidy up blank lines, and finally join single line breaks with a space.
//
// Returns the resulting text.
string Document::remove_html(string html)
{
    html = mastodonpp::unescape_html(html); // Decode HTML entities.

    // Accept any case, attributes and the XHTML forms (<br/>, <br />,
    // <p class="…">). Otherwise these would be removed by the generic tag
    // pattern below without leaving a line break behind.
    html = regex_replace(html, regex{R"(<p(\s[^>]*)?>)", regex::icase},
                         "\n\n");
    html = regex_replace(html, regex{R"(<br(\s[^>]*)?/?>)", regex::icase},
                         "\n");

    const list re_list{regex{R"(<!\[CDATA\[)"},      // CDATA beginning.
                       regex{R"(\]\]>)"},            // CDATA end.
                       regex{"<[^>]+>"},             // HTML tags.
                       regex{R"(\r)"},               // Carriage return.
                       regex{"\\n[ \\t\u00a0]+\\n"}, // Space between newlines.
                       regex{R"(^\n+)"}};            // Newlines at beginning.
    for (const regex &re : re_list)
    {
        html = regex_replace(html, re, "");
    }

    // Remove excess newlines.
    html = regex_replace(html, regex{R"(\n{3,})"}, "\n\n");
    // Replace single newlines with spaces (?<= is lookbehind, ?= is lookahead).
    html = regex_replace(html, regex{R"((?<=[^\n])\n(?=[^\n]))"}, " ");

    BOOST_LOG_TRIVIAL(debug) << "Converted HTML to text.";

    return html;
}

// Reads the "Location" header from an HTTP answer.
//
// Returns the header value. Throws std::runtime_error if the value is empty.
string Document::extract_location(const curl_wrapper::answer &answer)
{
    string target{answer.get_header("Location")};

    if (!target.empty())
    {
        return target;
    }

    throw std::runtime_error{"Could not extract new feed location."};
}

// Returns a copy of `text` in which, for every watchword, the first
// case-insensitive occurrence is prefixed with '#'.
//
// An occurrence only counts if it is preceded by whitespace, U+200B or the
// start of the text, and followed by whitespace, U+200B, punctuation or the
// end of the text. The watchword is inserted into the pattern verbatim.
string Document::add_hashtags(const string &text)
{
    string tagged{text};

    for (const auto &word : _watchwords)
    {
        const string pattern{"([[:space:]\u200b]|^)(" + word
                             + ")([[:space:]\u200b[:punct:]]|$)"};
        const regex matcher(pattern, regex::icase);

        // $1 and $3 put the surrounding characters back unchanged.
        tagged = regex_replace(tagged, matcher, "$1#$2$3",
                               boost::format_first_only);
    }

    return tagged;
}

void Document::parse_watchwords()
{
    Json::Value json;
    const auto filepath = _cfg.get_config_dir() /= "watchwords.json";
    ifstream file(filepath.c_str());
    if (file.good())
    {
        stringstream rawjson;
        rawjson << file.rdbuf();
        rawjson >> json;
        BOOST_LOG_TRIVIAL(debug) << "Read " << filepath;
    }
    else
    {
        BOOST_LOG_TRIVIAL(warning)
            << "File Not found: "
            << (_cfg.get_config_dir() /= "watchwords.json").string();
        return;
    }

    const auto &tags_profile = json[_cfg.profile]["tags"];
    const auto &tags_global = json["global"]["tags"];
    transform(tags_profile.begin(), tags_profile.end(),
              std::back_inserter(_watchwords),
              [](const Json::Value &value) { return value.asString(); });
    transform(tags_global.begin(), tags_global.end(),
              std::back_inserter(_watchwords),
              [](const Json::Value &value) { return value.asString(); });
}

} // namespace mastorss
Print --help, --version and implement downloading. 2019-12-20 01:14:23 +01:00			`/* This file is part of mastorss.`
Replace <br> with newline. 2021-07-12 23:35:42 +02:00			`* Copyright © 2019-2021 tastytea <tastytea@tastytea.de>`
Print --help, --version and implement downloading. 2019-12-20 01:14:23 +01:00			`*`
			`* This program is free software: you can redistribute it and/or modify`
			`* it under the terms of the GNU General Public License as published by`
			`* the Free Software Foundation, version 3.`
			`*`
			`* This program is distributed in the hope that it will be useful,`
			`* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`* GNU General Public License for more details.`
			`*`
			`* You should have received a copy of the GNU General Public License`
			`* along with this program. If not, see <http://www.gnu.org/licenses/>.`
			`*/`

			`#include "document.hpp"`
Reformatting with clang-format. 2020-06-19 23:04:42 +02:00
Get rid of restclient-cpp dependency. Use curl_wrapper instead. 2020-11-21 22:30:24 +01:00			`#include "curl_wrapper.hpp"`
Print --help, --version and implement downloading. 2019-12-20 01:14:23 +01:00			`#include "exceptions.hpp"`
			`#include "version.hpp"`

Add debug logging. 2019-12-24 18:53:45 +01:00			`#include <boost/log/trivial.hpp>`
Extract and parse <item>s from RSS feeds. 2019-12-25 02:08:53 +01:00			`#include <boost/property_tree/xml_parser.hpp>`
			`#include <boost/regex.hpp>`
Add watchwords (hashtags) support. 2019-12-28 09:48:42 +01:00			`#include <json/json.h>`
Switch from mastodon-cpp to mastodonpp. 2020-01-15 19:05:49 +01:00			`#include <mastodonpp/mastodonpp.hpp>`
Print --help, --version and implement downloading. 2019-12-20 01:14:23 +01:00
Add watchwords (hashtags) support. 2019-12-28 09:48:42 +01:00			`#include <algorithm>`
			`#include <fstream>`
Extract and parse <item>s from RSS feeds. 2019-12-25 02:08:53 +01:00			`#include <sstream>`
Make Document::extract_location() more robust. 2020-08-24 18:14:12 +02:00			`#include <stdexcept>`
Print --help, --version and implement downloading. 2019-12-20 01:14:23 +01:00			`#include <string>`
			`#include <utility>`

Add debug logging. 2019-12-24 18:53:45 +01:00			`namespace mastorss`
			`{`
Extract and parse <item>s from RSS feeds. 2019-12-25 02:08:53 +01:00			`using boost::regex;`
			`using boost::regex_replace;`
Replace for-loop with std::any_of. 2019-12-29 04:00:48 +01:00			`using std::any_of;`
Add watchwords (hashtags) support. 2019-12-28 09:48:42 +01:00			`using std::ifstream;`
Extract and parse <item>s from RSS feeds. 2019-12-25 02:08:53 +01:00			`using std::istringstream;`
Print --help, --version and implement downloading. 2019-12-20 01:14:23 +01:00			`using std::move;`
Reformatting with clang-format. 2020-06-19 23:04:42 +02:00			`using std::string;`
			`using std::stringstream;`
			`using std::transform;`
Print --help, --version and implement downloading. 2019-12-20 01:14:23 +01:00
Reformatting with clang-format. 2020-06-19 23:04:42 +02:00			`bool operator!=(const Item &a, const Item &b)`
Add operator != to Item. To be able to determine if the last item is visited in main(). 2019-12-28 07:12:28 +01:00			`{`
			`return a.guid != b.guid;`
			`}`

Pass whole Config object to Document. 2019-12-25 06:26:20 +01:00			`Document::Document(Config &cfg)`
			`: _cfg{cfg}`
More renaming. 2019-12-28 09:10:32 +01:00			`, _profiledata{_cfg.profiledata}`
Print --help, --version and implement downloading. 2019-12-20 01:14:23 +01:00			`{`
			`download();`
			`}`

Treat permanent redirects after temporary redirects as temporary. Example: http://example.com/ returns 302 with Location: http://example.com/1 http://example.com/1 returns 301 with Location: http://example.com/2 http://example.com/ will not be overwritten in the config file. 2019-12-29 00:41:14 +01:00			`void Document::download(const string &uri, const bool temp_redirect)`
Print --help, --version and implement downloading. 2019-12-20 01:14:23 +01:00			`{`
Get rid of restclient-cpp dependency. Use curl_wrapper instead. 2020-11-21 22:30:24 +01:00			`namespace cw = curl_wrapper;`

Add extra debug logging. 2020-08-24 16:52:20 +02:00			`BOOST_LOG_TRIVIAL(debug) << "Downloading <" << uri << "> …";`
Get rid of restclient-cpp dependency. Use curl_wrapper instead. 2020-11-21 22:30:24 +01:00			`cw::CURLWrapper curl;`
			`curl.set_useragent(string("mastorss/") += version);`
			`curl.set_maxredirs(0);`
Print --help, --version and implement downloading. 2019-12-20 01:14:23 +01:00
Get rid of restclient-cpp dependency. Use curl_wrapper instead. 2020-11-21 22:30:24 +01:00			`const auto answer{curl.make_http_request(cw::http_method::GET, uri)};`
Print --help, --version and implement downloading. 2019-12-20 01:14:23 +01:00
Get rid of restclient-cpp dependency. Use curl_wrapper instead. 2020-11-21 22:30:24 +01:00			`BOOST_LOG_TRIVIAL(debug) << "Got response: " << answer.status;`
Add extra debug logging. 2020-08-24 16:52:20 +02:00			`BOOST_LOG_TRIVIAL(debug) << "Got Headers:";`
Get rid of restclient-cpp dependency. Use curl_wrapper instead. 2020-11-21 22:30:24 +01:00			`BOOST_LOG_TRIVIAL(debug) << answer.headers;`
Add extra debug logging. 2020-08-24 16:52:20 +02:00
Get rid of restclient-cpp dependency. Use curl_wrapper instead. 2020-11-21 22:30:24 +01:00			`switch (answer.status)`
Print --help, --version and implement downloading. 2019-12-20 01:14:23 +01:00			`{`
			`case 200:`
			`{`
Get rid of restclient-cpp dependency. Use curl_wrapper instead. 2020-11-21 22:30:24 +01:00			`_raw_doc = answer.body;`
More renaming. 2019-12-28 09:10:32 +01:00			`BOOST_LOG_TRIVIAL(debug) << "Downloaded feed: " << _profiledata.feedurl;`
Print --help, --version and implement downloading. 2019-12-20 01:14:23 +01:00			`break;`
			`}`
			`case 301:`
			`case 308:`
			`{`
Treat permanent redirects after temporary redirects as temporary. Example: http://example.com/ returns 302 with Location: http://example.com/1 http://example.com/1 returns 301 with Location: http://example.com/2 http://example.com/ will not be overwritten in the config file. 2019-12-29 00:41:14 +01:00			`if (temp_redirect)`
			`{`
Shut up clang-tidy about goto. 2019-12-29 08:33:06 +01:00			`goto temporary_redirect; // NOLINT(cppcoreguidelines-avoid-goto)`
Treat permanent redirects after temporary redirects as temporary. Example: http://example.com/ returns 302 with Location: http://example.com/1 http://example.com/1 returns 301 with Location: http://example.com/2 http://example.com/ will not be overwritten in the config file. 2019-12-29 00:41:14 +01:00			`}`
Get rid of restclient-cpp dependency. Use curl_wrapper instead. 2020-11-21 22:30:24 +01:00			`_profiledata.feedurl = extract_location(answer);`
More renaming. 2019-12-28 09:10:32 +01:00			`if (_profiledata.feedurl.empty())`
Handle HTTP redirects. 2019-12-25 06:28:31 +01:00			`{`
Get rid of restclient-cpp dependency. Use curl_wrapper instead. 2020-11-21 22:30:24 +01:00			`throw HTTPException{answer.status};`
Handle HTTP redirects. 2019-12-25 06:28:31 +01:00			`}`

Reformatting with clang-format. 2020-06-19 23:04:42 +02:00			`// clang-format off`
Clarify redirect log messages. 2019-12-25 19:31:07 +01:00			`BOOST_LOG_TRIVIAL(debug) << "Feed has new location (permanent): "`
More renaming. 2019-12-28 09:10:32 +01:00			`<< _profiledata.feedurl;`
Reformatting with clang-format. 2020-06-19 23:04:42 +02:00			`// clang-format on`
Handle HTTP redirects. 2019-12-25 06:28:31 +01:00			`_cfg.write();`
			`download();`
			`break;`
			`}`
			`case 302:`
			`case 303:`
			`case 307:`
			`{`
Add extra debug logging. 2020-08-24 16:52:20 +02:00			`temporary_redirect:`
Get rid of restclient-cpp dependency. Use curl_wrapper instead. 2020-11-21 22:30:24 +01:00			`const string newuri{extract_location(answer)};`
Handle HTTP redirects. 2019-12-25 06:28:31 +01:00			`if (newuri.empty())`
			`{`
Get rid of restclient-cpp dependency. Use curl_wrapper instead. 2020-11-21 22:30:24 +01:00			`throw HTTPException{answer.status};`
Handle HTTP redirects. 2019-12-25 06:28:31 +01:00			`}`

Reformatting with clang-format. 2020-06-19 23:04:42 +02:00			`// clang-format off`
Clarify redirect log messages. 2019-12-25 19:31:07 +01:00			`BOOST_LOG_TRIVIAL(debug) << "Feed has new location (temporary): "`
Log right URI for temporaray redirects. 2019-12-29 00:49:23 +01:00			`<< newuri;`
Reformatting with clang-format. 2020-06-19 23:04:42 +02:00			`// clang-format on`
Treat permanent redirects after temporary redirects as temporary. Example: http://example.com/ returns 302 with Location: http://example.com/1 http://example.com/1 returns 301 with Location: http://example.com/2 http://example.com/ will not be overwritten in the config file. 2019-12-29 00:41:14 +01:00			`download(newuri, true);`
Handle HTTP redirects. 2019-12-25 06:28:31 +01:00			`break;`
Print --help, --version and implement downloading. 2019-12-20 01:14:23 +01:00			`}`
			`default:`
			`{`
Get rid of restclient-cpp dependency. Use curl_wrapper instead. 2020-11-21 22:30:24 +01:00			`throw HTTPException{answer.status};`
Print --help, --version and implement downloading. 2019-12-20 01:14:23 +01:00			`}`
			`}`
			`}`
Extract and parse <item>s from RSS feeds. 2019-12-25 02:08:53 +01:00
Handle HTTP redirects. 2019-12-25 06:28:31 +01:00			`void Document::download()`
			`{`
More renaming. 2019-12-28 09:10:32 +01:00			`download(_profiledata.feedurl);`
Handle HTTP redirects. 2019-12-25 06:28:31 +01:00			`}`

Extract and parse <item>s from RSS feeds. 2019-12-25 02:08:53 +01:00			`void Document::parse()`
			`{`
Don't try to parse watchwords if add_hashtags is false. 2020-11-21 20:09:35 +01:00			`if (_profiledata.add_hashtags)`
			`{`
			`parse_watchwords();`
			`}`
Extract and parse <item>s from RSS feeds. 2019-12-25 02:08:53 +01:00			`pt::ptree tree;`
			`istringstream iss{_raw_doc};`
			`pt::read_xml(iss, tree);`

			`if (tree.front().first == "rss")`
			`{`
Add a bit of logging. 2019-12-25 02:42:47 +01:00			`BOOST_LOG_TRIVIAL(debug) << "RSS detected.";`
Extract and parse <item>s from RSS feeds. 2019-12-25 02:08:53 +01:00			`parse_rss(tree);`
			`}`
Actually throw ParseException. 🤦 2020-01-01 12:53:11 +01:00			`else`
			`{`
			`throw ParseException{"Could not detect type of feed."};`
			`}`
Extract and parse <item>s from RSS feeds. 2019-12-25 02:08:53 +01:00			`}`

			`void Document::parse_rss(const pt::ptree &tree)`
			`{`
Don't parse more items than Config::max_guids. 2020-11-05 14:14:33 +01:00			`size_t counter{0};`
Extract and parse <item>s from RSS feeds. 2019-12-25 02:08:53 +01:00			`for (const auto &child : tree.get_child("rss.channel"))`
			`{`
Don't parse more items than Config::max_guids. 2020-11-05 14:14:33 +01:00			`if (counter == Config::max_guids)`
			`{`
			`BOOST_LOG_TRIVIAL(debug)`
			`<< "Maximum number of items reached. Stopped parsing.";`
			`break;`
			`}`
			`++counter;`
Extract and parse <item>s from RSS feeds. 2019-12-25 02:08:53 +01:00			`if (child.first == "item")`
			`{`
			`const auto &rssitem = child.second;`

Allow GUID to be empty. 2019-12-28 23:33:28 +01:00			`string guid{rssitem.get<string>("guid", "")};`
Reformatting with clang-format. 2020-06-19 23:04:42 +02:00			`if (guid.empty()) // We hope either <guid> or <link> are present.`
Extract and parse <item>s from RSS feeds. 2019-12-25 02:08:53 +01:00			`{`
			`guid = rssitem.get<string>("link");`
			`}`
Check for already posted guids in ProfileData::guids. 2020-01-01 12:57:50 +01:00			`if (any_of(_profiledata.guids.begin(), _profiledata.guids.end(),`
Reformatting with clang-format. 2020-06-19 23:04:42 +02:00			`[&](const auto &old_guid) { return guid == old_guid; }))`
Extract and parse <item>s from RSS feeds. 2019-12-25 02:08:53 +01:00			`{`
Reformatting with clang-format. 2020-06-19 23:04:42 +02:00			`// clang-format off`
Add guid of already posted items to debug output. 2020-01-01 13:44:54 +01:00			`BOOST_LOG_TRIVIAL(debug) << "Found already posted GUID: "`
			`<< guid;`
Reformatting with clang-format. 2020-06-19 23:04:42 +02:00			`// clang-format on`
Add config option keep_looking. If set, don't stop at first already posted guid. 2020-01-01 13:25:05 +01:00			`if (_profiledata.keep_looking)`
			`{`
			`continue;`
			`}`

			`BOOST_LOG_TRIVIAL(debug) << "Stopped parsing.";`
Extract and parse <item>s from RSS feeds. 2019-12-25 02:08:53 +01:00			`break;`
			`}`

Fix some initializations. 2019-12-28 06:27:56 +01:00			`string title{rssitem.get<string>("title")};`
Replace for-loop with std::any_of. 2019-12-29 04:00:48 +01:00			`if (any_of(_profiledata.skip.begin(), _profiledata.skip.end(),`
			`[&title](const string &skip)`
Reformatting with clang-format. 2020-06-19 23:04:42 +02:00			`// clang-format off`
Replace for-loop with std::any_of. 2019-12-29 04:00:48 +01:00			`{ return title.substr(0, skip.size()) == skip; }))`
Reformatting with clang-format. 2020-06-19 23:04:42 +02:00			`// clang-format on`
Extract and parse <item>s from RSS feeds. 2019-12-25 02:08:53 +01:00			`{`
			`BOOST_LOG_TRIVIAL(debug) << "Skipped GUID: " << guid;`
			`continue;`
			`}`

			`Item item;`
Fix some initializations. 2019-12-28 06:27:56 +01:00			`item.description = [&]`
Reformatting with clang-format. 2020-06-19 23:04:42 +02:00			`// clang-format off`
Fix some initializations. 2019-12-28 06:27:56 +01:00			`{`
Apply individual fixes before removing HTML. 2019-12-28 23:31:10 +01:00			`string desc{rssitem.get<string>("description")};`
More renaming. 2019-12-28 09:10:32 +01:00			`for (const auto &fix : _profiledata.fixes)`
Apply user-fixes. 2019-12-25 02:42:27 +01:00			`{`
Fix some initializations. 2019-12-28 06:27:56 +01:00			`desc = regex_replace(desc, regex{fix}, "");`
			`}`
Apply individual fixes before removing HTML. 2019-12-28 23:31:10 +01:00			`desc = remove_html(desc);`
Make hashtag-replacement optional. 2020-10-29 15:06:51 +01:00			`if (_profiledata.add_hashtags)`
			`{`
			`desc = add_hashtags(desc);`
			`}`
			`return desc;`
Fix some initializations. 2019-12-28 06:27:56 +01:00			`}();`
Reformatting with clang-format. 2020-06-19 23:04:42 +02:00			`// clang-format on`
Extract and parse <item>s from RSS feeds. 2019-12-25 02:08:53 +01:00			`item.guid = move(guid);`
			`item.link = rssitem.get<string>("link");`
Unescape HTML in titles. 2020-06-19 22:20:03 +02:00			`item.title = mastodonpp::unescape_html(title);`
Begin with the oldest item. 2019-12-28 06:48:35 +01:00			`new_items.push_front(item);`
Extract and parse <item>s from RSS feeds. 2019-12-25 02:08:53 +01:00
			`BOOST_LOG_TRIVIAL(debug) << "Found GUID: " << item.guid;`
Only add one new item if it is the first run. 2019-12-28 06:54:05 +01:00
Don't stop after first post if keep_looking = true. 2020-11-21 23:53:47 +01:00			`if (_profiledata.guids.empty() && !_profiledata.keep_looking)`
Only add one new item if it is the first run. 2019-12-28 06:54:05 +01:00			`{`
Add log message for first run. 2019-12-28 07:16:15 +01:00			`BOOST_LOG_TRIVIAL(debug) << "This is the first run.";`
Only add one new item if it is the first run. 2019-12-28 06:54:05 +01:00			`break;`
			`}`
Extract and parse <item>s from RSS feeds. 2019-12-25 02:08:53 +01:00			`}`
			`}`
			`}`

Make Document::remove_html static. 2021-07-12 23:39:00 +02:00			`string Document::remove_html(string html)`
Extract and parse <item>s from RSS feeds. 2019-12-25 02:08:53 +01:00			`{`
Switch from mastodon-cpp to mastodonpp. 2020-01-15 19:05:49 +01:00			`html = mastodonpp::unescape_html(html); // Decode HTML entities.`
Extract and parse <item>s from RSS feeds. 2019-12-25 02:08:53 +01:00
			`html = regex_replace(html, regex{"<p>"}, "\n\n");`
Replace <br> with newline. 2021-07-12 23:35:42 +02:00			`html = regex_replace(html, regex{"<br>"}, "\n");`
Extract and parse <item>s from RSS feeds. 2019-12-25 02:08:53 +01:00
Reformatting with clang-format. 2020-06-19 23:04:42 +02:00			`const list re_list{regex{R"(<!\[CDATA\[)"}, // CDATA beginning.`
			`regex{R"(\]\]>)"}, // CDATA end.`
			`regex{"<[^>]+>"}, // HTML tags.`
			`regex{R"(\r)"}, // Carriage return.`
			`regex{"\\n[ \\t\u00a0]+\\n"}, // Space between newlines.`
			`regex{R"(^\n+)"}}; // Newlines at beginning.`
Extract and parse <item>s from RSS feeds. 2019-12-25 02:08:53 +01:00			`for (const regex &re : re_list)`
			`{`
			`html = regex_replace(html, re, "");`
			`}`

			`// Remove excess newlines.`
			`html = regex_replace(html, regex{R"(\n{3,})"}, "\n\n");`
			`// Replace single newlines with spaces (?<= is lookbehind, ?= is lookahead).`
			`html = regex_replace(html, regex{R"((?<=[^\n])\n(?=[^\n]))"}, " ");`

Add a bit of logging. 2019-12-25 02:42:47 +01:00			`BOOST_LOG_TRIVIAL(debug) << "Converted HTML to text.";`

Extract and parse <item>s from RSS feeds. 2019-12-25 02:08:53 +01:00			`return html;`
			`}`
Handle HTTP redirects. 2019-12-25 06:28:31 +01:00
Get rid of restclient-cpp dependency. Use curl_wrapper instead. 2020-11-21 22:30:24 +01:00			`string Document::extract_location(const curl_wrapper::answer &answer)`
Handle HTTP redirects. 2019-12-25 06:28:31 +01:00			`{`
Fix variable initialization in extract_location(). 2020-11-21 22:43:41 +01:00			`string location{answer.get_header("Location")};`
Get rid of restclient-cpp dependency. Use curl_wrapper instead. 2020-11-21 22:30:24 +01:00
			`if (location.empty())`
Make Document::extract_location() more robust. 2020-08-24 18:14:12 +02:00			`{`
Get rid of restclient-cpp dependency. Use curl_wrapper instead. 2020-11-21 22:30:24 +01:00			`throw std::runtime_error{"Could not extract new feed location."};`
Make Document::extract_location() more robust. 2020-08-24 18:14:12 +02:00			`}`

Handle HTTP redirects. 2019-12-25 06:28:31 +01:00			`return location;`
			`}`
Add watchwords (hashtags) support. 2019-12-28 09:48:42 +01:00
			`string Document::add_hashtags(const string &text)`
Parse watchwords only once. 2019-12-29 00:05:13 +01:00			`{`
			`string out{text};`
			`for (const auto &tag : _watchwords)`
			`{`
Reformatting with clang-format. 2020-06-19 23:04:42 +02:00			`regex re_tag("([[:space:]\u200b]\|^)(" + tag`
			`+ ")([[:space:]\u200b[:punct:]]\|$)",`
			`regex::icase);`
Parse watchwords only once. 2019-12-29 00:05:13 +01:00			`out = regex_replace(out, re_tag, "$1#$2$3", boost::format_first_only);`
			`}`

			`return out;`
			`}`

			`void Document::parse_watchwords()`
Add watchwords (hashtags) support. 2019-12-28 09:48:42 +01:00			`{`
			`Json::Value json;`
Log path of watchwords.json after reading. 2019-12-28 09:58:49 +01:00			`const auto filepath = _cfg.get_config_dir() /= "watchwords.json";`
Fix ifstream initialization. 2020-11-21 23:21:02 +01:00			`ifstream file(filepath.c_str());`
Add watchwords (hashtags) support. 2019-12-28 09:48:42 +01:00			`if (file.good())`
			`{`
			`stringstream rawjson;`
			`rawjson << file.rdbuf();`
			`rawjson >> json;`
Log path of watchwords.json after reading. 2019-12-28 09:58:49 +01:00			`BOOST_LOG_TRIVIAL(debug) << "Read " << filepath;`
Add watchwords (hashtags) support. 2019-12-28 09:48:42 +01:00			`}`
			`else`
			`{`
Reformatting with clang-format. 2020-06-19 23:04:42 +02:00			`BOOST_LOG_TRIVIAL(warning)`
			`<< "File Not found: "`
			`<< (_cfg.get_config_dir() /= "watchwords.json").string();`
Don't throw error if watchwords.json does not exist. 2020-01-03 03:26:00 +01:00			`return;`
Add watchwords (hashtags) support. 2019-12-28 09:48:42 +01:00			`}`

			`const auto &tags_profile = json[_cfg.profile]["tags"];`
			`const auto &tags_global = json["global"]["tags"];`
			`transform(tags_profile.begin(), tags_profile.end(),`
Parse watchwords only once. 2019-12-29 00:05:13 +01:00			`std::back_inserter(_watchwords),`
Add watchwords (hashtags) support. 2019-12-28 09:48:42 +01:00			`[](const Json::Value &value) { return value.asString(); });`
			`transform(tags_global.begin(), tags_global.end(),`
Parse watchwords only once. 2019-12-29 00:05:13 +01:00			`std::back_inserter(_watchwords),`
Add watchwords (hashtags) support. 2019-12-28 09:48:42 +01:00			`[](const Json::Value &value) { return value.asString(); });`
			`}`

Add debug logging. 2019-12-24 18:53:45 +01:00			`} // namespace mastorss`