2019-05-15 07:04:12 +02:00
|
|
|
/* This file is part of remwharead.
|
2020-02-07 11:56:05 +01:00
|
|
|
* Copyright © 2019, 2020 tastytea <tastytea@tastytea.de>
|
2019-05-15 07:04:12 +02:00
|
|
|
*
|
|
|
|
* This program is free software: you can redistribute it and/or modify
|
|
|
|
* it under the terms of the GNU General Public License as published by
|
|
|
|
* the Free Software Foundation, version 3.
|
|
|
|
*
|
|
|
|
* This program is distributed in the hope that it will be useful,
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
* GNU General Public License for more details.
|
|
|
|
*
|
|
|
|
* You should have received a copy of the GNU General Public License
|
|
|
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
*/
|
|
|
|
|
2019-09-30 13:20:36 +02:00
|
|
|
#include "uri.hpp"
|
2020-10-31 13:21:40 +01:00
|
|
|
|
2020-10-31 16:41:59 +01:00
|
|
|
#include "curl_wrapper.hpp"
|
2019-09-30 13:20:36 +02:00
|
|
|
#include "version.hpp"
|
2020-10-31 13:21:40 +01:00
|
|
|
|
2019-09-30 13:20:36 +02:00
|
|
|
#include <Poco/RegularExpression.h>
|
2020-10-31 13:21:40 +01:00
|
|
|
#include <boost/locale.hpp>
|
|
|
|
|
2020-10-31 16:41:59 +01:00
|
|
|
#include <atomic>
|
2019-09-30 13:20:36 +02:00
|
|
|
#include <codecvt>
|
|
|
|
#include <cstdint>
|
|
|
|
#include <exception>
|
|
|
|
#include <iostream>
|
2019-10-15 15:10:39 +02:00
|
|
|
#include <iterator>
|
2019-09-30 13:20:36 +02:00
|
|
|
#include <locale>
|
2020-10-31 16:41:59 +01:00
|
|
|
#include <stdexcept>
|
|
|
|
#include <string>
|
2019-09-30 13:20:36 +02:00
|
|
|
#include <utility>
|
|
|
|
#include <vector>
|
2019-05-15 07:04:12 +02:00
|
|
|
|
2019-07-27 09:59:43 +02:00
|
|
|
namespace remwharead
|
2019-05-15 07:04:12 +02:00
|
|
|
{
|
2019-09-30 13:20:36 +02:00
|
|
|
using std::array;
|
2020-10-31 16:41:59 +01:00
|
|
|
using std::exception;
|
2019-09-30 13:20:36 +02:00
|
|
|
using std::move;
|
2020-10-31 16:41:59 +01:00
|
|
|
using std::to_string;
|
2019-10-25 06:00:36 +02:00
|
|
|
using std::uint32_t;
|
2020-10-31 13:21:40 +01:00
|
|
|
using std::vector;
|
2019-09-30 13:20:36 +02:00
|
|
|
using RegEx = Poco::RegularExpression;
|
|
|
|
|
2020-10-31 13:25:02 +01:00
|
|
|
// Lets an html_extract be used directly in a condition; the result is the
// value of its `successful` member.
html_extract::operator bool() const
{
    return this->successful;
}
|
2019-08-06 12:13:27 +02:00
|
|
|
|
2020-10-31 13:25:02 +01:00
|
|
|
// Lets an archive_answer be used directly in a condition; the result is the
// value of its `successful` member.
archive_answer::operator bool() const
{
    return this->successful;
}
|
2019-08-06 12:40:52 +02:00
|
|
|
|
2019-09-30 13:20:36 +02:00
|
|
|
// Stores the given URI and installs a Boost.Locale-generated locale, built
// from the empty locale id, as the global C++ locale.
//
// FIXME: Only call locale-stuff once after getting rid of POCO.
// NOTE: The global locale is replaced on every construction. According to the
//       original author it is needed for the Boost functions used in this
//       file; the reason for the FIXME above is no longer known.
URI::URI(string uri)
    : _uri(move(uri))
{
    const boost::locale::generator generator;
    std::locale::global(generator.generate(""));
}
|
|
|
|
|
|
|
|
// Downloads the document behind _uri, converts it to UTF-8 and extracts
// title, description and the tag-free text from it.
//
// Returns {true, "", title, description, text} if a non-empty document was
// received. If anything throws an std::exception, {false, e.what(), "", "", ""}
// is returned; an empty document yields {false, "Unknown error.", "", "", ""}.
html_extract URI::get()
{
    try
    {
        CURLWrapper curl;

        // Store the raw response *before* converting it: to_utf8() calls
        // detect_encoding(), which looks for the charset declaration in
        // _document. Converting first would run the detection on the stale
        // (initially empty) document and never find the page's encoding.
        // NOTE: If the conversion throws, _document keeps the raw response.
        _document = curl.make_request(_uri, false);
        _document = to_utf8(_document);

        if (!_document.empty())
        {
            return {true, "", extract_title(), extract_description(),
                    strip_html()};
        }
    }
    catch (const exception &e)
    {
        return {false, e.what(), "", "", ""};
    }

    return {false, "Unknown error.", "", "", ""};
}
|
2019-07-27 09:59:43 +02:00
|
|
|
|
2019-12-11 13:00:43 +01:00
|
|
|
// Returns the contents of the first <title> element of _document with HTML
// entities decoded and newlines replaced by spaces. Returns an empty string
// if is_html() is false or no title was found.
string URI::extract_title() const
{
    if (!is_html())
    {
        return "";
    }

    // Capture group 1 is the text between <title …> and the next '<'.
    const RegEx re_title("<title(?: [^>]+)?>([^<]+)", RegEx::RE_CASELESS);
    vector<string> found;
    re_title.split(_document, found);

    // found[0] is the whole match, found[1] the captured title text.
    if (found.size() < 2)
    {
        return "";
    }

    return remove_newlines(unescape_html(found[1]));
}
|
2019-05-15 07:04:12 +02:00
|
|
|
|
2019-12-11 13:00:43 +01:00
|
|
|
// Returns the `content` value of the first description <meta> tag found in
// _document, with HTML entities decoded, newlines replaced by spaces and the
// result shortened via cut_text(…, 500). Returns an empty string if is_html()
// is false or no description was found.
string URI::extract_description() const
{
    if (!is_html())
    {
        return "";
    }

    // Capture group 1 is the value of the content attribute.
    const RegEx re_desc(R"(description"[^>]+content="([^"]+))",
                        RegEx::RE_CASELESS);
    vector<string> found;
    re_desc.split(_document, found);

    // found[0] is the whole match, found[1] the captured description.
    if (found.size() < 2)
    {
        return "";
    }

    const string description = remove_newlines(unescape_html(found[1]));
    return cut_text(description, 500);
}
|
2019-05-16 12:03:09 +02:00
|
|
|
|
2019-12-11 13:00:43 +01:00
|
|
|
// Returns _document as plain text: <script> and <style> elements are removed
// together with their contents, all remaining tags are stripped, carriage
// returns are dropped, whitespace in front of newlines is removed, runs of
// newlines are collapsed and finally HTML entities are decoded.
string URI::strip_html() const
{
    // Remove JavaScript and CSS first, then every remaining tag.
    const string without_code =
        remove_html_tags(remove_html_tags(_document, "script"), "style");
    string text = remove_html_tags(without_code);

    // Remove CR.
    for (size_t cr = text.find('\r'); cr != std::string::npos;
         cr = text.find('\r', cr))
    {
        text.erase(cr, 1);
    }

    // Remove whitespace at eol.
    const RegEx re_trailing("\\s+\n");
    re_trailing.subst(text, "\n", RegEx::RE_GLOBAL);

    // Reduce newlines.
    const RegEx re_newlines("\n{2,}");
    re_newlines.subst(text, "\n", RegEx::RE_GLOBAL);

    return unescape_html(text);
}
|
|
|
|
|
2020-01-27 06:32:28 +01:00
|
|
|
// Removes HTML tags from `html` and returns the result.
//
// If `tag` is empty, everything from each '<' up to and including the next
// '>' is dropped; an opening '<' without a closing '>' discards the rest of
// the input. Otherwise every span from "<tag" up to the matching "</tag" plus
// one more character (the expected '>') is dropped, contents included; the
// removal stops at the first opening tag that has no closing tag.
//
// NOTE: I did this with regex_replace before, but libstdc++ segfaulted.
string URI::remove_html_tags(const string &html, const string &tag)
{
    if (!tag.empty())
    {
        const string opening = "<" + tag;
        const string closing = "</" + tag;
        string stripped = html;

        // Always search from the start again, the string shrinks each round.
        for (size_t start = stripped.find(opening);
             start != std::string::npos; start = stripped.find(opening))
        {
            const size_t close = stripped.find(closing, start);
            if (close == std::string::npos)
            {
                break;
            }

            // closing + '>'; erase() clamps the count at the end of string.
            const size_t end = close + closing.length() + 1;
            stripped.erase(start, end - start);
        }

        return stripped;
    }

    string text;
    size_t cursor = 0;
    while (cursor != std::string::npos)
    {
        // Copy the text in front of the next tag (or the remainder).
        const size_t open = html.find('<', cursor);
        text.append(html, cursor, open - cursor);

        // Continue behind the end of the tag, if there is one.
        const size_t close = html.find('>', open);
        cursor = (close == std::string::npos) ? std::string::npos : close + 1;
    }

    return text;
}
|
2019-05-15 07:04:12 +02:00
|
|
|
|
2020-01-27 06:32:28 +01:00
|
|
|
// Replaces HTML entities in `html` with their UTF-8 encoded characters and
// returns the result.
//
// Numeric entities ("&#8230;", "&#x2026;") are decoded first, followed by the
// named entities from the table below. Numeric entities that are not a valid
// number or not a valid code point are left untouched instead of aborting
// the whole conversion with an exception.
string URI::unescape_html(string html)
{
    // Used to convert int to utf-8 char.
    std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> u8c;
    const RegEx re_entity("&#(x)?([[:alnum:]]{1,8});");
    RegEx::MatchVec matches;
    string::size_type pos = 0;

    while (re_entity.match(html, pos, matches) != 0)
    {
        const string number = html.substr(matches[2].offset, matches[2].length);
        // 'x' in front of the number means it's hexadecimal, else decimal.
        const int base = (matches[1].length != 0) ? 16 : 10;

        string unicode;
        try
        {
            // The pattern allows arbitrary alphanumerics, so std::stoul may
            // throw std::invalid_argument, and to_bytes() throws
            // std::range_error for values that are not valid code points.
            const auto codepoint =
                static_cast<char32_t>(std::stoul(number, nullptr, base));
            unicode = u8c.to_bytes(codepoint);
        }
        catch (const exception &)
        {
            // Keep the malformed entity as it is and continue behind it.
            pos = matches[0].offset + matches[0].length;
            continue;
        }

        html.replace(matches[0].offset, matches[0].length, unicode);
        // Continue behind the inserted character, never re-scan it.
        pos = matches[0].offset + unicode.length();
    }

    // Source: https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_
    // entity_references#Character_entity_references_in_HTML
    const array<const std::pair<const string, const char32_t>, 258> names = {
        {{"exclamation", 0x0021}, {"quot", 0x0022},    {"percent", 0x0025},
         {"amp", 0x0026},         {"apos", 0x0027},    {"add", 0x002B},
         {"lt", 0x003C},          {"equal", 0x003D},   {"gt", 0x003E},
         {"nbsp", 0x00A0},        {"iexcl", 0x00A1},   {"cent", 0x00A2},
         {"pound", 0x00A3},       {"curren", 0x00A4},  {"yen", 0x00A5},
         {"brvbar", 0x00A6},      {"sect", 0x00A7},    {"uml", 0x00A8},
         {"copy", 0x00A9},        {"ordf", 0x00AA},    {"laquo", 0x00AB},
         {"not", 0x00AC},         {"shy", 0x00AD},     {"reg", 0x00AE},
         {"macr", 0x00AF},        {"deg", 0x00B0},     {"plusmn", 0x00B1},
         {"sup2", 0x00B2},        {"sup3", 0x00B3},    {"acute", 0x00B4},
         {"micro", 0x00B5},       {"para", 0x00B6},    {"middot", 0x00B7},
         {"cedil", 0x00B8},       {"sup1", 0x00B9},    {"ordm", 0x00BA},
         {"raquo", 0x00BB},       {"frac14", 0x00BC},  {"frac12", 0x00BD},
         {"frac34", 0x00BE},      {"iquest", 0x00BF},  {"Agrave", 0x00C0},
         {"Aacute", 0x00C1},      {"Acirc", 0x00C2},   {"Atilde", 0x00C3},
         {"Auml", 0x00C4},        {"Aring", 0x00C5},   {"AElig", 0x00C6},
         {"Ccedil", 0x00C7},      {"Egrave", 0x00C8},  {"Eacute", 0x00C9},
         {"Ecirc", 0x00CA},       {"Euml", 0x00CB},    {"Igrave", 0x00CC},
         {"Iacute", 0x00CD},      {"Icirc", 0x00CE},   {"Iuml", 0x00CF},
         {"ETH", 0x00D0},         {"Ntilde", 0x00D1},  {"Ograve", 0x00D2},
         {"Oacute", 0x00D3},      {"Ocirc", 0x00D4},   {"Otilde", 0x00D5},
         {"Ouml", 0x00D6},        {"times", 0x00D7},   {"Oslash", 0x00D8},
         {"Ugrave", 0x00D9},      {"Uacute", 0x00DA},  {"Ucirc", 0x00DB},
         {"Uuml", 0x00DC},        {"Yacute", 0x00DD},  {"THORN", 0x00DE},
         {"szlig", 0x00DF},       {"agrave", 0x00E0},  {"aacute", 0x00E1},
         {"acirc", 0x00E2},       {"atilde", 0x00E3},  {"auml", 0x00E4},
         {"aring", 0x00E5},       {"aelig", 0x00E6},   {"ccedil", 0x00E7},
         {"egrave", 0x00E8},      {"eacute", 0x00E9},  {"ecirc", 0x00EA},
         {"euml", 0x00EB},        {"igrave", 0x00EC},  {"iacute", 0x00ED},
         {"icirc", 0x00EE},       {"iuml", 0x00EF},    {"eth", 0x00F0},
         {"ntilde", 0x00F1},      {"ograve", 0x00F2},  {"oacute", 0x00F3},
         {"ocirc", 0x00F4},       {"otilde", 0x00F5},  {"ouml", 0x00F6},
         {"divide", 0x00F7},      {"oslash", 0x00F8},  {"ugrave", 0x00F9},
         {"uacute", 0x00FA},      {"ucirc", 0x00FB},   {"uuml", 0x00FC},
         {"yacute", 0x00FD},      {"thorn", 0x00FE},   {"yuml", 0x00FF},
         {"OElig", 0x0152},       {"oelig", 0x0153},   {"Scaron", 0x0160},
         {"scaron", 0x0161},      {"Yuml", 0x0178},    {"fnof", 0x0192},
         {"circ", 0x02C6},        {"tilde", 0x02DC},   {"Alpha", 0x0391},
         {"Beta", 0x0392},        {"Gamma", 0x0393},   {"Delta", 0x0394},
         {"Epsilon", 0x0395},     {"Zeta", 0x0396},    {"Eta", 0x0397},
         {"Theta", 0x0398},       {"Iota", 0x0399},    {"Kappa", 0x039A},
         {"Lambda", 0x039B},      {"Mu", 0x039C},      {"Nu", 0x039D},
         {"Xi", 0x039E},          {"Omicron", 0x039F}, {"Pi", 0x03A0},
         {"Rho", 0x03A1},         {"Sigma", 0x03A3},   {"Tau", 0x03A4},
         {"Upsilon", 0x03A5},     {"Phi", 0x03A6},     {"Chi", 0x03A7},
         {"Psi", 0x03A8},         {"Omega", 0x03A9},   {"alpha", 0x03B1},
         {"beta", 0x03B2},        {"gamma", 0x03B3},   {"delta", 0x03B4},
         {"epsilon", 0x03B5},     {"zeta", 0x03B6},    {"eta", 0x03B7},
         {"theta", 0x03B8},       {"iota", 0x03B9},    {"kappa", 0x03BA},
         {"lambda", 0x03BB},      {"mu", 0x03BC},      {"nu", 0x03BD},
         {"xi", 0x03BE},          {"omicron", 0x03BF}, {"pi", 0x03C0},
         {"rho", 0x03C1},         {"sigmaf", 0x03C2},  {"sigma", 0x03C3},
         {"tau", 0x03C4},         {"upsilon", 0x03C5}, {"phi", 0x03C6},
         {"chi", 0x03C7},         {"psi", 0x03C8},     {"omega", 0x03C9},
         {"thetasym", 0x03D1},    {"upsih", 0x03D2},   {"piv", 0x03D6},
         {"ensp", 0x2002},        {"emsp", 0x2003},    {"thinsp", 0x2009},
         {"zwnj", 0x200C},        {"zwj", 0x200D},     {"lrm", 0x200E},
         {"rlm", 0x200F},         {"ndash", 0x2013},   {"mdash", 0x2014},
         {"horbar", 0x2015},      {"lsquo", 0x2018},   {"rsquo", 0x2019},
         {"sbquo", 0x201A},       {"ldquo", 0x201C},   {"rdquo", 0x201D},
         {"bdquo", 0x201E},       {"dagger", 0x2020},  {"Dagger", 0x2021},
         {"bull", 0x2022},        {"hellip", 0x2026},  {"permil", 0x2030},
         {"prime", 0x2032},       {"Prime", 0x2033},   {"lsaquo", 0x2039},
         {"rsaquo", 0x203A},      {"oline", 0x203E},   {"frasl", 0x2044},
         {"euro", 0x20AC},        {"image", 0x2111},   {"weierp", 0x2118},
         {"real", 0x211C},        {"trade", 0x2122},   {"alefsym", 0x2135},
         {"larr", 0x2190},        {"uarr", 0x2191},    {"rarr", 0x2192},
         {"darr", 0x2193},        {"harr", 0x2194},    {"crarr", 0x21B5},
         {"lArr", 0x21D0},        {"uArr", 0x21D1},    {"rArr", 0x21D2},
         {"dArr", 0x21D3},        {"hArr", 0x21D4},    {"forall", 0x2200},
         {"part", 0x2202},        {"exist", 0x2203},   {"empty", 0x2205},
         {"nabla", 0x2207},       {"isin", 0x2208},    {"notin", 0x2209},
         {"ni", 0x220B},          {"prod", 0x220F},    {"sum", 0x2211},
         {"minus", 0x2212},       {"lowast", 0x2217},  {"radic", 0x221A},
         {"prop", 0x221D},        {"infin", 0x221E},   {"ang", 0x2220},
         {"and", 0x2227},         {"or", 0x2228},      {"cap", 0x2229},
         {"cup", 0x222A},         {"int", 0x222B},     {"there4", 0x2234},
         {"sim", 0x223C},         {"cong", 0x2245},    {"asymp", 0x2248},
         {"ne", 0x2260},          {"equiv", 0x2261},   {"le", 0x2264},
         {"ge", 0x2265},          {"sub", 0x2282},     {"sup", 0x2283},
         {"nsub", 0x2284},        {"sube", 0x2286},    {"supe", 0x2287},
         {"oplus", 0x2295},       {"otimes", 0x2297},  {"perp", 0x22A5},
         {"sdot", 0x22C5},        {"lceil", 0x2308},   {"rceil", 0x2309},
         {"lfloor", 0x230A},      {"rfloor", 0x230B},  {"lang", 0x2329},
         {"rang", 0x232A},        {"loz", 0x25CA},     {"spades", 0x2660},
         {"clubs", 0x2663},       {"hearts", 0x2665},  {"diams", 0x2666}}};

    for (const auto &pair : names)
    {
        // "&amp;" is handled after the loop, see below.
        if (pair.first == "amp")
        {
            continue;
        }

        const RegEx re('&' + pair.first + ';');
        re.subst(html, u8c.to_bytes(pair.second), RegEx::RE_GLOBAL);
    }

    // Decode "&amp;" last. Doing it in table order would turn an escaped
    // entity such as "&amp;lt;" into "&lt;" and then, in a later iteration,
    // into "<" (double decoding).
    const RegEx re_amp("&amp;");
    re_amp.subst(html, "&", RegEx::RE_GLOBAL);

    return html;
}
|
|
|
|
|
2019-10-27 20:38:58 +01:00
|
|
|
// Asks the Wayback Machine (web.archive.org) to save _uri.
//
// Returns {false, "Only HTTP(S) is archivable.", ""} if _uri does not start
// with "http". If the request yields a non-empty answer, the result is
// {true, "", "https://web.archive.org" + answer}. An std::exception leads to
// {false, e.what(), ""}, an empty answer to {false, "Unknown error.", ""}.
archive_answer URI::archive() const
{
    const bool is_http = (_uri.compare(0, 4, "http") == 0);
    if (!is_http)
    {
        return {false, "Only HTTP(S) is archivable.", ""};
    }

    try
    {
        const string save_uri = "https://web.archive.org/save/" + _uri;

        CURLWrapper curl;
        const string answer = curl.make_request(save_uri, true);

        // NOTE(review): answer is presumably the path of the new snapshot,
        //               confirm against CURLWrapper::make_request().
        if (!answer.empty())
        {
            return {true, "", "https://web.archive.org" + answer};
        }
    }
    catch (const exception &e)
    {
        return {false, e.what(), ""};
    }

    return {false, "Unknown error.", ""};
}
|
2019-05-16 08:36:35 +02:00
|
|
|
|
2020-01-27 06:32:28 +01:00
|
|
|
// Returns `text` with every newline replaced by a space. A carriage return
// directly in front of a newline is replaced by a space as well, so "\r\n"
// becomes two spaces.
string URI::remove_newlines(string text)
{
    size_t posn = 0;
    while ((posn = text.find('\n', posn)) != std::string::npos)
    {
        text[posn] = ' ';

        // Only look at the previous character if there is one. Without this
        // check a newline at position 0 made the index wrap around and the
        // string was accessed out of bounds.
        if (posn > 0 && text[posn - 1] == '\r')
        {
            text[posn - 1] = ' ';
        }

        ++posn;
    }

    return text;
}
|
2019-10-15 15:10:39 +02:00
|
|
|
|
2020-01-27 06:32:28 +01:00
|
|
|
// Shortens `text` to at most `n_chars` bytes, including the appended suffix
// " […]". Texts that are not longer than `n_chars` are returned unchanged.
//
// The text is cut at the last space that leaves room for the suffix. If
// there is no such space, it is cut hard at the limit, moved back to the
// start of a UTF-8 sequence so that no multi-byte character is split.
//
// Throws std::invalid_argument if the text has to be cut and `n_chars` is
// not greater than the length of the suffix.
string URI::cut_text(const string &text, const uint16_t n_chars)
{
    if (text.size() <= n_chars)
    {
        return text;
    }

    constexpr char suffix[] = " […]";
    // Length in bytes, without the terminating NUL.
    constexpr auto suffix_len = std::end(suffix) - std::begin(suffix) - 1;
    if (n_chars <= suffix_len)
    {
        throw std::invalid_argument("n_chars has to be greater than "
                                    + std::to_string(suffix_len));
    }

    // limit < text.size() holds here, so text[limit] is a valid index.
    const auto limit = static_cast<size_t>(n_chars - suffix_len);
    size_t pos = text.rfind(' ', limit);
    if (pos == std::string::npos)
    {
        // No space to cut at. substr(0, npos) would return the whole text
        // and nothing would be shortened, so cut at the limit instead and
        // skip backwards over UTF-8 continuation bytes (0b10xxxxxx).
        pos = limit;
        while (pos > 0
               && (static_cast<unsigned char>(text[pos]) & 0xC0U) == 0x80U)
        {
            --pos;
        }
    }

    return text.substr(0, pos) + suffix;
}
|
|
|
|
|
2019-12-11 13:01:44 +01:00
|
|
|
// Returns `str` converted from _encoding to UTF-8. If _encoding is still
// empty, detect_encoding() is called first. Strings that are already
// declared as "utf-8" are returned unchanged.
string URI::to_utf8(const string &str)
{
    const bool encoding_unknown = _encoding.empty();
    if (encoding_unknown)
    {
        detect_encoding();
    }

    const bool needs_conversion = (_encoding != "utf-8");
    if (needs_conversion)
    {
        return boost::locale::conv::to_utf<char>(str, _encoding);
    }

    return str;
}
|
|
|
|
|
|
|
|
void URI::detect_encoding()
|
|
|
|
{
|
2019-12-19 21:49:53 +01:00
|
|
|
const RegEx re_encoding(R"(<meta.+charset="([^";]+))", RegEx::RE_CASELESS);
|
2019-12-11 13:01:44 +01:00
|
|
|
vector<string> matches;
|
|
|
|
re_encoding.split(_document, matches);
|
|
|
|
if (matches.size() >= 2)
|
|
|
|
{
|
2019-12-11 17:16:35 +01:00
|
|
|
_encoding = boost::locale::to_lower(matches[1]);
|
2019-12-11 13:01:44 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-12-11 13:00:43 +01:00
|
|
|
bool URI::is_html() const
|
|
|
|
{
|
|
|
|
const RegEx re_htmlfile(".*\\.(.?html?|xml|rss)$", RegEx::RE_CASELESS);
|
2019-12-11 17:16:06 +01:00
|
|
|
return (_uri.substr(0, 4) == "http" || re_htmlfile.match(_uri));
|
2019-12-11 13:00:43 +01:00
|
|
|
}
|
|
|
|
|
2019-09-25 03:58:29 +02:00
|
|
|
} // namespace remwharead
|