/*  This file is part of mastorss.
 *  Copyright © 2019 tastytea
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, version 3.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <https://www.gnu.org/licenses/>.
 */

#include "document.hpp"
#include "exceptions.hpp"
#include "version.hpp"

// NOTE(review): the targets of the angle-bracket includes were lost; the list
// below is reconstructed from the names this file uses — confirm.
#include <boost/log/trivial.hpp>
#include <boost/regex.hpp>
#include <mastodon-cpp/mastodon-cpp.hpp>
#include <restclient-cpp/connection.h>
#include <restclient-cpp/restclient.h>

#include <list>
#include <sstream>
#include <string>
#include <utility>

namespace mastorss
{
using boost::regex;
using boost::regex_replace;
using std::list;
using std::istringstream;
using std::string;
using std::move;

// Initializes the HTTP client and downloads the feed configured in cfg.
// Throws whatever download() throws.
Document::Document(Config &cfg)
    : _cfg{cfg}
    , _data{cfg.data}
{
    RestClient::init();

    try
    {
        download();
    }
    catch (...)
    {
        // The destructor does not run if the constructor throws, so undo
        // RestClient::init() here before passing the error on.
        RestClient::disable();
        throw;
    }
}

// Shuts the HTTP client down again.
Document::~Document()
{
    RestClient::disable();
}

// Downloads the document at uri into _raw_doc.
//
// Redirects are handled here instead of by the HTTP client: a permanent
// redirect (301, 308) replaces the stored feed URL and writes the
// configuration, a temporary one (302, 303, 307) is only followed.
// Redirects are followed recursively without a depth limit.
//
// Throws HTTPException on a redirect without a location and on every other
// HTTP status, CURLException if the response code is -1.
void Document::download(const string &uri)
{
    RestClient::Connection connection{uri};
    connection.SetUserAgent(string("mastorss/").append(version));
    connection.FollowRedirects(false);

    const RestClient::Response response{connection.get("")};

    switch (response.code)
    {
    case 200:
    {
        _raw_doc = response.body;
        BOOST_LOG_TRIVIAL(debug) << "Downloaded feed: " << _data.feedurl;
        break;
    }
    case 301:
    case 308:
    {
        // Check the new location before storing it, so that a redirect
        // without a location does not wipe the configured feed URL.
        string newuri{extract_location(response.headers)};
        if (newuri.empty())
        {
            throw HTTPException{response.code};
        }
        _data.feedurl = move(newuri);

        BOOST_LOG_TRIVIAL(debug) << "Feed has new location (permanent): "
                                 << _data.feedurl;
        _cfg.write();
        download();
        break;
    }
    case 302:
    case 303:
    case 307:
    {
        const string newuri{extract_location(response.headers)};
        if (newuri.empty())
        {
            throw HTTPException{response.code};
        }

        BOOST_LOG_TRIVIAL(debug) << "Feed has new location (temporary): "
                                 << newuri;
        download(newuri);
        break;
    }
    case -1:
    {
        throw CURLException{errno};
    }
    default:
    {
        throw HTTPException{response.code};
    }
    }
}

// Downloads the document from the configured feed URL.
void Document::download()
{
    download(_data.feedurl);
}

// Parses the downloaded document as XML and hands it to the matching feed
// parser. Only RSS is recognized; anything else is ignored.
void Document::parse()
{
    pt::ptree tree;
    istringstream iss{_raw_doc};
    pt::read_xml(iss, tree);

    // Look the root element up by name: the tree may be empty, and comments
    // in front of the root element are stored as nodes of their own.
    if (tree.count("rss") != 0)
    {
        BOOST_LOG_TRIVIAL(debug) << "RSS detected.";
        parse_rss(tree);
    }
}

// Collects the items of an RSS feed in new_items, in document order, and
// stops at the item whose GUID equals the last known GUID.
//
// Items whose title starts with one of the configured skip strings are left
// out. Each configured fix is applied to the description as a regular
// expression whose matches are removed.
//
// Throws if the tree has no rss.channel or an item has no <link>.
void Document::parse_rss(const pt::ptree &tree)
{
    for (const auto &child : tree.get_child("rss.channel"))
    {
        if (child.first != "item")
        {
            continue;
        }
        const auto &rssitem = child.second;

        // Default to "" so that a missing <guid> reaches the fallback below
        // instead of throwing.
        string guid{rssitem.get<string>("guid", "")};
        if (guid.empty())   // We hope either <guid> or <link> are present.
        {
            guid = rssitem.get<string>("link");
        }
        if (guid == _data.last_guid)
        {
            break;
        }

        // <title> and <description> are both optional in RSS 2.0.
        string title{rssitem.get<string>("title", "")};

        bool skipthis{false};
        for (const auto &skip : _data.skip)
        {
            // True if title starts with skip.
            if (title.compare(0, skip.length(), skip) == 0)
            {
                skipthis = true;
                break;
            }
        }
        if (skipthis)
        {
            BOOST_LOG_TRIVIAL(debug) << "Skipped GUID: " << guid;
            continue;
        }

        Item item;
        item.description = [&]
        {
            string desc{remove_html(rssitem.get<string>("description", ""))};
            for (const auto &fix : _data.fixes)
            {
                desc = regex_replace(desc, regex{fix}, "");
            }
            return desc;
        }();
        item.guid = move(guid);
        item.link = rssitem.get<string>("link");
        item.title = move(title);

        BOOST_LOG_TRIVIAL(debug) << "Found GUID: " << item.guid;
        new_items.push_back(move(item));
    }
}

// Converts HTML to plain text: decodes entities, turns <p> into a blank
// line, removes CDATA markers, all remaining tags and carriage returns,
// and normalizes newlines. Returns the converted text.
string Document::remove_html(string html) const
{
    html = Mastodon::unescape_html(html); // Decode HTML entities.

    html = regex_replace(html, regex{"<p>"}, "\n\n");

    const list<regex> re_list
    {
        regex{R"(<!\[CDATA\[)"},        // CDATA beginning.
        regex{R"(\]\]>)"},              // CDATA end.
        regex{"<[^>]+>"},               // HTML tags.
        regex{R"(\r)"},                 // Carriage return.
        regex{"\\n[ \\t\u00a0]+\\n"},   // Whitespace between newlines.
        regex{R"(^\n+)"}                // Newlines at the beginning.
    };
    for (const regex &re : re_list)
    {
        html = regex_replace(html, re, "");
    }

    // Remove excess newlines.
    html = regex_replace(html, regex{R"(\n{3,})"}, "\n\n");
    // Replace single newlines with spaces (?<= is lookbehind, ?= is lookahead).
    html = regex_replace(html, regex{R"((?<=[^\n])\n(?=[^\n]))"}, " ");

    BOOST_LOG_TRIVIAL(debug) << "Converted HTML to text.";

    return html;
}

// Returns the value of the "Location" header, or of "location" if the
// former is missing or empty. Returns an empty string if neither holds a
// value, so that callers can test the result with empty().
string Document::extract_location(const RestClient::HeaderFields &headers) const
{
    for (const char *key : {"Location", "location"})
    {
        const auto it = headers.find(key);
        if (it != headers.end() && !it->second.empty())
        {
            return it->second;
        }
    }

    return {};
}
} // namespace mastorss