mastorss/src/document.cpp

/*  This file is part of mastorss.
 *  Copyright © 2019 tastytea <tastytea@tastytea.de>
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, version 3.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#include "document.hpp"
#include "exceptions.hpp"
#include "version.hpp"

#include <boost/log/trivial.hpp>
#include <boost/property_tree/xml_parser.hpp>
#include <boost/regex.hpp>
#include <mastodon-cpp/mastodon-cpp.hpp>
#include <restclient-cpp/connection.h>

#include <list>
#include <sstream>
#include <string>
#include <utility>

namespace mastorss
{
using boost::regex;
using boost::regex_replace;
using std::list;
using std::istringstream;
using std::string;
using std::move;

// Two items are considered different when their GUIDs differ; no other
// field takes part in the comparison.
bool operator !=(const Item &a, const Item &b)
{
    const bool same_guid{a.guid == b.guid};
    return !same_guid;
}

// Binds the document to the configuration, initialises the HTTP client
// library and downloads the feed right away.
//
// Exceptions thrown by download() are passed on to the caller unchanged.
Document::Document(Config &cfg)
    : _cfg{cfg}
    , profiledata{cfg.data}
{
    RestClient::init();

    try
    {
        download();
    }
    catch (...)
    {
        // The destructor does not run for an object whose constructor threw,
        // so undo RestClient::init() here before passing the error on.
        RestClient::disable();
        throw;
    }
}

// Counterpart to the RestClient::init() call made in the constructor.
Document::~Document()
{
    RestClient::disable();
}

void Document::download(const string &uri)
{
    RestClient::Connection connection{uri};
    connection.SetUserAgent(string("mastorss/").append(version));
    connection.FollowRedirects(false);

    RestClient::Response response{connection.get("")};

    switch (response.code)
    {
    case 200:
    {
        _raw_doc = response.body;
        BOOST_LOG_TRIVIAL(debug) << "Downloaded feed: " << profiledata.feedurl;
        break;
    }
    case 301:
    case 308:
    {
        profiledata.feedurl = extract_location(response.headers);
        if (profiledata.feedurl.empty())
        {
            throw HTTPException{response.code};
        }

        BOOST_LOG_TRIVIAL(debug) << "Feed has new location (permanent): "
                                 << profiledata.feedurl;
        _cfg.write();
        download();
        break;
    }
    case 302:
    case 303:
    case 307:
    {
        const string newuri{extract_location(response.headers)};
        if (newuri.empty())
        {
            throw HTTPException{response.code};
        }

        BOOST_LOG_TRIVIAL(debug) << "Feed has new location (temporary): "
                                 << profiledata.feedurl;
        download(newuri);
        break;
    }
    case -1:
    {
        throw CURLException{errno};
    }
    default:
    {
        throw HTTPException{response.code};
    }
    }
}

// Downloads the feed from the URL currently stored in profiledata.feedurl.
void Document::download()
{
    download(profiledata.feedurl);
}

// Parses the downloaded document (_raw_doc) as XML and hands it to the
// matching feed parser. Only RSS is recognised; any other document is left
// unprocessed. Errors from pt::read_xml() are passed on to the caller.
void Document::parse()
{
    pt::ptree tree;
    istringstream iss{_raw_doc};
    pt::read_xml(iss, tree);

    // Look the root element up by name instead of reading the first child:
    // front() is undefined for an empty tree, and the first child is not
    // necessarily the root element (a leading XML comment is stored as a
    // node of its own).
    if (tree.count("rss") != 0)
    {
        BOOST_LOG_TRIVIAL(debug) << "RSS detected.";
        parse_rss(tree);
    }
}

// Collects the new <item>s of an RSS feed into new_items.
//
// Items are visited in document order and pushed to the front of new_items,
// so the item that comes first in the feed ends up last. Processing stops at
// the item whose GUID equals profiledata.last_guid. If no last GUID is known
// (first run), only a single item is collected.
//
// Items whose title starts with one of profiledata.skip are ignored. The
// description is converted to plain text and every regular expression in
// profiledata.fixes is removed from it.
//
// Throws if the tree has no "rss.channel" node (from get_child()).
void Document::parse_rss(const pt::ptree &tree)
{
    for (const auto &child : tree.get_child("rss.channel"))
    {
        if (child.first != "item")
        {
            continue;
        }
        const auto &rssitem = child.second;

        // Every element of an <item> may be missing, so read them with an
        // empty default instead of letting get() throw and abort the feed.
        string guid{rssitem.get<string>("guid", string{})};
        if (guid.empty())   // Fall back to <link> as identifier.
        {
            guid = rssitem.get<string>("link", string{});
        }
        if (guid.empty())
        {
            // Without an identifier the item can not be tracked, and an
            // empty GUID would compare equal to an empty last_guid.
            BOOST_LOG_TRIVIAL(debug) << "Skipped item without GUID or link.";
            continue;
        }
        if (guid == profiledata.last_guid)
        {
            break;
        }

        bool skipthis{false};
        string title{rssitem.get<string>("title", string{})};
        for (const auto &skip : profiledata.skip)
        {
            // Prefix test without creating a temporary substring.
            if (title.compare(0, skip.length(), skip) == 0)
            {
                skipthis = true;
                break;
            }
        }
        if (skipthis)
        {
            BOOST_LOG_TRIVIAL(debug) << "Skipped GUID: " << guid;
            continue;
        }

        Item item;
        item.description = [&]
        {
            string desc
                {remove_html(rssitem.get<string>("description", string{}))};
            for (const auto &fix : profiledata.fixes)
            {
                desc = regex_replace(desc, regex{fix}, "");
            }
            return desc;
        }();
        item.guid = move(guid);
        item.link = rssitem.get<string>("link", string{});
        item.title = move(title);

        BOOST_LOG_TRIVIAL(debug) << "Found GUID: " << item.guid;
        new_items.push_front(move(item));

        if (profiledata.last_guid.empty())
        {
            BOOST_LOG_TRIVIAL(debug) << "This is the first run.";
            break;
        }
    }
}

// Converts an HTML fragment to plain text.
//
// HTML entities are decoded, <p> starts a new paragraph, CDATA markers, all
// remaining tags and carriage returns are removed. Paragraphs are separated
// by exactly one empty line; single newlines inside a paragraph become
// spaces.
string Document::remove_html(string html) const
{
    html = Mastodon::unescape_html(html); // Decode HTML entities.

    html = regex_replace(html, regex{"<p>"}, "\n\n");

    const list strip_list
        {
            regex{R"(<!\[CDATA\[)"},      // CDATA beginning.
            regex{R"(\]\]>)"},            // CDATA end.
            regex{"<[^>]+>"},             // HTML tags.
            regex{R"(\r)"}                // Carriage return.
        };
    for (const regex &re : strip_list)
    {
        html = regex_replace(html, re, "");
    }

    // Turn lines that consist only of whitespace into empty lines. Only the
    // whitespace is dropped: the closing newline is matched by a lookahead,
    // so the line break between the neighbouring lines survives and
    // consecutive whitespace-only lines are all caught. The no-break space is
    // matched as a complete byte sequence, not as separate bytes of a
    // character class.
    html = regex_replace(html, regex{"\\n(?:[ \\t]|\u00a0)+(?=\\n)"}, "\n");

    // Remove newlines at the beginning of the text. \A anchors to the start
    // of the buffer; ^ would also match after every embedded newline in
    // Boost.Regex and thereby delete the empty lines between paragraphs.
    html = regex_replace(html, regex{R"(\A\n+)"}, "");

    // Remove excess newlines.
    html = regex_replace(html, regex{R"(\n{3,})"}, "\n\n");
    // Replace single newlines with spaces (?<= is lookbehind, ?= is lookahead).
    html = regex_replace(html, regex{R"((?<=[^\n])\n(?=[^\n]))"}, " ");

    BOOST_LOG_TRIVIAL(debug) << "Converted HTML to text.";

    return html;
}

// Returns the redirect target from the response headers.
//
// The header is looked up as "Location" first and as "location" second; the
// first non-empty value wins. Returns an empty string if neither is present,
// which callers treat as a failed redirect.
string Document::extract_location(const RestClient::HeaderFields &headers) const
{
    // find() instead of at(): at() throws for a missing key, which would
    // skip the second spelling in exactly the case it is meant for.
    for (const char *key : {"Location", "location"})
    {
        const auto it{headers.find(key)};
        if (it != headers.end() && !it->second.empty())
        {
            return it->second;
        }
    }
    return {};
}
} // namespace mastorss
Print --help, --version and implement downloading. 2019-12-20 01:14:23 +01:00			`/* This file is part of mastorss.`
			`* Copyright © 2019 tastytea <tastytea@tastytea.de>`
			`*`
			`* This program is free software: you can redistribute it and/or modify`
			`* it under the terms of the GNU General Public License as published by`
			`* the Free Software Foundation, version 3.`
			`*`
			`* This program is distributed in the hope that it will be useful,`
			`* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`* GNU General Public License for more details.`
			`*`
			`* You should have received a copy of the GNU General Public License`
			`* along with this program. If not, see <http://www.gnu.org/licenses/>.`
			`*/`

			`#include "document.hpp"`
			`#include "exceptions.hpp"`
			`#include "version.hpp"`

Add debug logging. 2019-12-24 18:53:45 +01:00			`#include <boost/log/trivial.hpp>`
Extract and parse <item>s from RSS feeds. 2019-12-25 02:08:53 +01:00			`#include <boost/property_tree/xml_parser.hpp>`
			`#include <boost/regex.hpp>`
			`#include <mastodon-cpp/mastodon-cpp.hpp>`
Print --help, --version and implement downloading. 2019-12-20 01:14:23 +01:00			`#include <restclient-cpp/connection.h>`

Extract and parse <item>s from RSS feeds. 2019-12-25 02:08:53 +01:00			`#include <list>`
			`#include <sstream>`
Print --help, --version and implement downloading. 2019-12-20 01:14:23 +01:00			`#include <string>`
			`#include <utility>`

Add debug logging. 2019-12-24 18:53:45 +01:00			`namespace mastorss`
			`{`
Extract and parse <item>s from RSS feeds. 2019-12-25 02:08:53 +01:00			`using boost::regex;`
			`using boost::regex_replace;`
			`using std::list;`
			`using std::istringstream;`
Print --help, --version and implement downloading. 2019-12-20 01:14:23 +01:00			`using std::string;`
			`using std::move;`

Add operator != to Item. To be able to determine if the last item is visited in main(). 2019-12-28 07:12:28 +01:00			`bool operator !=(const Item &a, const Item &b)`
			`{`
			`return a.guid != b.guid;`
			`}`

Pass whole Config object to Document. 2019-12-25 06:26:20 +01:00			`Document::Document(Config &cfg)`
			`: _cfg{cfg}`
Rename profile → profiledata. 2019-12-28 09:01:17 +01:00			`, profiledata{cfg.data}`
Print --help, --version and implement downloading. 2019-12-20 01:14:23 +01:00			`{`
			`RestClient::init();`

			`download();`
			`}`

			`Document::~Document()`
			`{`
			`RestClient::disable();`
			`}`

Handle HTTP redirects. 2019-12-25 06:28:31 +01:00			`void Document::download(const string &uri)`
Print --help, --version and implement downloading. 2019-12-20 01:14:23 +01:00			`{`
Handle HTTP redirects. 2019-12-25 06:28:31 +01:00			`RestClient::Connection connection{uri};`
Print --help, --version and implement downloading. 2019-12-20 01:14:23 +01:00			`connection.SetUserAgent(string("mastorss/").append(version));`
Handle HTTP redirects. 2019-12-25 06:28:31 +01:00			`connection.FollowRedirects(false);`
Print --help, --version and implement downloading. 2019-12-20 01:14:23 +01:00
			`RestClient::Response response{connection.get("")};`

			`switch (response.code)`
			`{`
			`case 200:`
			`{`
			`_raw_doc = response.body;`
Rename profile → profiledata. 2019-12-28 09:01:17 +01:00			`BOOST_LOG_TRIVIAL(debug) << "Downloaded feed: " << profiledata.feedurl;`
Print --help, --version and implement downloading. 2019-12-20 01:14:23 +01:00			`break;`
			`}`
			`case 301:`
			`case 308:`
			`{`
Rename profile → profiledata. 2019-12-28 09:01:17 +01:00			`profiledata.feedurl = extract_location(response.headers);`
			`if (profiledata.feedurl.empty())`
Handle HTTP redirects. 2019-12-25 06:28:31 +01:00			`{`
			`throw HTTPException{response.code};`
			`}`

Clarify redirect log messages. 2019-12-25 19:31:07 +01:00			`BOOST_LOG_TRIVIAL(debug) << "Feed has new location (permanent): "`
Rename profile → profiledata. 2019-12-28 09:01:17 +01:00			`<< profiledata.feedurl;`
Handle HTTP redirects. 2019-12-25 06:28:31 +01:00			`_cfg.write();`
			`download();`
			`break;`
			`}`
			`case 302:`
			`case 303:`
			`case 307:`
			`{`
Remove useless move. 2019-12-25 19:28:48 +01:00			`const string newuri{extract_location(response.headers)};`
Handle HTTP redirects. 2019-12-25 06:28:31 +01:00			`if (newuri.empty())`
			`{`
			`throw HTTPException{response.code};`
			`}`

Clarify redirect log messages. 2019-12-25 19:31:07 +01:00			`BOOST_LOG_TRIVIAL(debug) << "Feed has new location (temporary): "`
Rename profile → profiledata. 2019-12-28 09:01:17 +01:00			`<< profiledata.feedurl;`
Remove useless move. 2019-12-25 19:28:48 +01:00			`download(newuri);`
Handle HTTP redirects. 2019-12-25 06:28:31 +01:00			`break;`
Print --help, --version and implement downloading. 2019-12-20 01:14:23 +01:00			`}`
			`case -1:`
			`{`
			`throw CURLException{errno};`
			`}`
			`default:`
			`{`
			`throw HTTPException{response.code};`
			`}`
			`}`
			`}`
Extract and parse <item>s from RSS feeds. 2019-12-25 02:08:53 +01:00
Handle HTTP redirects. 2019-12-25 06:28:31 +01:00			`void Document::download()`
			`{`
Rename profile → profiledata. 2019-12-28 09:01:17 +01:00			`download(profiledata.feedurl);`
Handle HTTP redirects. 2019-12-25 06:28:31 +01:00			`}`

Extract and parse <item>s from RSS feeds. 2019-12-25 02:08:53 +01:00			`void Document::parse()`
			`{`
			`pt::ptree tree;`
			`istringstream iss{_raw_doc};`
			`pt::read_xml(iss, tree);`

			`if (tree.front().first == "rss")`
			`{`
Add a bit of logging. 2019-12-25 02:42:47 +01:00			`BOOST_LOG_TRIVIAL(debug) << "RSS detected.";`
Extract and parse <item>s from RSS feeds. 2019-12-25 02:08:53 +01:00			`parse_rss(tree);`
			`}`
			`}`

			`void Document::parse_rss(const pt::ptree &tree)`
			`{`
			`for (const auto &child : tree.get_child("rss.channel"))`
			`{`
			`if (child.first == "item")`
			`{`
			`const auto &rssitem = child.second;`

			`string guid{rssitem.get<string>("guid")};`
			`if (guid.empty()) // We hope either <guid> or <link> are present.`
			`{`
			`guid = rssitem.get<string>("link");`
			`}`
Rename profile → profiledata. 2019-12-28 09:01:17 +01:00			`if (guid == profiledata.last_guid)`
Extract and parse <item>s from RSS feeds. 2019-12-25 02:08:53 +01:00			`{`
			`break;`
			`}`

			`bool skipthis{false};`
Fix some initializations. 2019-12-28 06:27:56 +01:00			`string title{rssitem.get<string>("title")};`
Rename profile → profiledata. 2019-12-28 09:01:17 +01:00			`for (const auto &skip : profiledata.skip)`
Extract and parse <item>s from RSS feeds. 2019-12-25 02:08:53 +01:00			`{`
			`if (title.substr(0, skip.length()) == skip)`
			`{`
			`skipthis = true;`
			`break;`
			`}`
			`}`
			`if (skipthis)`
			`{`
			`BOOST_LOG_TRIVIAL(debug) << "Skipped GUID: " << guid;`
			`continue;`
			`}`

			`Item item;`
Fix some initializations. 2019-12-28 06:27:56 +01:00			`item.description = [&]`
			`{`
			`string desc`
			`{remove_html(rssitem.get<string>("description"))};`
Rename profile → profiledata. 2019-12-28 09:01:17 +01:00			`for (const auto &fix : profiledata.fixes)`
Apply user-fixes. 2019-12-25 02:42:27 +01:00			`{`
Fix some initializations. 2019-12-28 06:27:56 +01:00			`desc = regex_replace(desc, regex{fix}, "");`
			`}`
			`return desc;`
			`}();`
Extract and parse <item>s from RSS feeds. 2019-12-25 02:08:53 +01:00			`item.guid = move(guid);`
			`item.link = rssitem.get<string>("link");`
			`item.title = move(title);`
Begin with the oldest item. 2019-12-28 06:48:35 +01:00			`new_items.push_front(item);`
Extract and parse <item>s from RSS feeds. 2019-12-25 02:08:53 +01:00
			`BOOST_LOG_TRIVIAL(debug) << "Found GUID: " << item.guid;`
Only add one new item if it is the first run. 2019-12-28 06:54:05 +01:00
Rename profile → profiledata. 2019-12-28 09:01:17 +01:00			`if (profiledata.last_guid.empty())`
Only add one new item if it is the first run. 2019-12-28 06:54:05 +01:00			`{`
Add log message for first run. 2019-12-28 07:16:15 +01:00			`BOOST_LOG_TRIVIAL(debug) << "This is the first run.";`
Only add one new item if it is the first run. 2019-12-28 06:54:05 +01:00			`break;`
			`}`
Extract and parse <item>s from RSS feeds. 2019-12-25 02:08:53 +01:00			`}`
			`}`
			`}`

			`string Document::remove_html(string html) const`
			`{`
			`html = Mastodon::unescape_html(html); // Decode HTML entities.`

			`html = regex_replace(html, regex{"<p>"}, "\n\n");`

			`const list re_list`
			`{`
			`regex{R"(<!\[CDATA\[)"}, // CDATA beginning.`
			`regex{R"(\]\]>)"}, // CDATA end.`
			`regex{"<[^>]+>"}, // HTML tags.`
			`regex{R"(\r)"}, // Carriage return.`
			`regex{"\\n[ \\t\u00a0]+\\n"}, // Whitespace between newlines.`
			`regex{R"(^\n+)"} // Newlines at the beginning.`
			`};`
			`for (const regex &re : re_list)`
			`{`
			`html = regex_replace(html, re, "");`
			`}`

			`// Remove excess newlines.`
			`html = regex_replace(html, regex{R"(\n{3,})"}, "\n\n");`
			`// Replace single newlines with spaces (?<= is lookbehind, ?= is lookahead).`
			`html = regex_replace(html, regex{R"((?<=[^\n])\n(?=[^\n]))"}, " ");`

Add a bit of logging. 2019-12-25 02:42:47 +01:00			`BOOST_LOG_TRIVIAL(debug) << "Converted HTML to text.";`

Extract and parse <item>s from RSS feeds. 2019-12-25 02:08:53 +01:00			`return html;`
			`}`
Handle HTTP redirects. 2019-12-25 06:28:31 +01:00
			`string Document::extract_location(const RestClient::HeaderFields &headers) const`
			`{`
			`string location{headers.at("Location")};`
			`if (location.empty())`
			`{`
			`location = headers.at("location");`
			`}`
			`return location;`
			`}`
Add debug logging. 2019-12-24 18:53:45 +01:00			`} // namespace mastorss`