remwharead/src/lib/uri.cpp

723 lines
21 KiB
C++

/* This file is part of remwharead.
* Copyright © 2019 tastytea <tastytea@tastytea.de>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, version 3.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include "uri.hpp"
#include "version.hpp"
#include <boost/locale.hpp>
#include <Poco/Environment.h>
#include <Poco/Exception.h>
#include <Poco/Net/HTTPClientSession.h>
#include <Poco/Net/HTTPRequest.h>
#include <Poco/Net/HTTPResponse.h>
#include <Poco/Net/HTTPSClientSession.h>
#include <Poco/RegularExpression.h>
#include <Poco/StreamCopier.h>
#include <Poco/URI.h>
#include <codecvt>
#include <cstdint>
#include <exception>
#include <iostream>
#include <iterator>
#include <locale>
#include <sstream>
#include <utility>
#include <vector>
namespace remwharead
{
using std::array;
using std::istream;
using std::unique_ptr;
using std::make_unique;
using std::vector;
using std::cerr;
using std::endl;
using std::move;
using std::uint32_t;
using Poco::Net::HTTPClientSession;
using Poco::Net::HTTPSClientSession;
using Poco::Net::HTTPRequest;
using Poco::Net::HTTPResponse;
using Poco::Net::HTTPMessage;
using Poco::StreamCopier;
using Poco::Environment;
using RegEx = Poco::RegularExpression;
// Lets an html_extract be used directly in a condition: it evaluates to the
// value of its `successful` flag.
html_extract::operator bool()
{
    return this->successful;
}
// Lets an archive_answer be used directly in a condition: it evaluates to the
// value of its `successful` flag.
archive_answer::operator bool()
{
    return this->successful;
}
// Stores the URI, installs the user's default locale (generated by
// Boost.Locale) as the global locale, initializes Poco's SSL support and
// applies the proxy configuration via set_proxy().
URI::URI(string uri)
    : _uri(move(uri))
{
    // FIXME(tastytea): Only call locale-stuff once after getting rid of Poco.
    // The global locale needs the Boost extras for the Boost functions used
    // in this file.
    std::locale::global(boost::locale::generator()(""));

    Poco::Net::initializeSSL();
    set_proxy();
}
// Reads the proxy from the environment variable `http_proxy` and installs it
// as the global proxy configuration for all HTTP client sessions.
//
// Accepted form: [http(s)://][user[:password]@]host[:port][/]
// A missing variable is not an error. Any other problem is reported on
// stderr and leaves the proxy configuration untouched; nothing is thrown.
void URI::set_proxy()
{
    try
    {
        HTTPClientSession::ProxyConfig proxy;
        const string env_proxy = Environment::get("http_proxy");
        const RegEx re_proxy("^(?:https?://)?(?:([^:]+):?([^@]*)@)?" // user:pw
                             "([^:/]+)(?::([\\d]{1,5}))?/?$"); // host:port
        vector<string> matches;

        // The host (group 3) is mandatory, so a successful match yields at
        // least 4 entries: whole match, user, password, host.
        if (re_proxy.split(env_proxy, matches) < 4)
        {
            return;
        }

        proxy.username = matches[1];
        proxy.password = matches[2];
        proxy.host = matches[3];

        // The port (group 4) is optional. If it did not take part in the
        // match, the result holds only 4 entries, so the size has to be
        // checked before matches[4] is touched.
        if (matches.size() > 4 && !matches[4].empty())
        {
            // NOLINTNEXTLINE(google-runtime-int) - Need to use same as stoul.
            const unsigned long port = std::stoul(matches[4]);
            if (port > 65535)
            {
                throw std::invalid_argument("Proxy port number out of range");
            }
            proxy.port = static_cast<uint16_t>(port);
        }

        HTTPClientSession::setGlobalProxyConfig(proxy);
    }
    catch (const Poco::RegularExpressionException &e)
    {
        cerr << "Error: Proxy could not be set (" << e.displayText() << ")\n";
    }
    catch (const std::invalid_argument &e)
    {
        cerr << "Error: " << e.what() << endl;
    }
    catch (const Poco::NotFoundException &)
    {
        // No proxy found, no problem.
    }
    catch (const std::exception &e)
    {
        cerr << "Unexpected exception: " << e.what() << endl;
    }
}
// Counterpart to the constructor's Poco::Net::initializeSSL() call.
// NOTE(review): every destroyed URI calls this once; if URI objects can be
// copied, the calls may not pair up with initializeSSL() - confirm in uri.hpp.
URI::~URI()
{
Poco::Net::uninitializeSSL();
}
// Downloads the document behind the URI, converts it to UTF-8 and extracts
// title, description and the tag-free full text.
//
// Returns an html_extract whose first field tells whether the download
// succeeded; on failure the second field carries the error message and the
// remaining fields are empty. Failures are reported through the return value
// for Poco errors as well as for standard exceptions (for example an unknown
// charset during the UTF-8 conversion), so no exception derived from
// std::exception leaves this function.
html_extract URI::get()
{
    try
    {
        // _document has to be set before to_utf8() runs, because the
        // encoding detection inspects _document.
        _document = make_request(_uri);
        _document = to_utf8(_document);
        if (!_document.empty())
        {
            return
            {
                true,
                "",
                extract_title(),
                extract_description(),
                strip_html()
            };
        }
    }
    catch (const Poco::Exception &e)
    {
        return { false, e.displayText(), "", "", "" };
    }
    catch (const std::exception &e)
    {
        // Poco::Exception is handled above; this covers everything else,
        // e.g. errors from Boost.Locale or the standard library.
        return { false, e.what(), "", "", "" };
    }

    return { false, "Unknown error.", "", "", "" };
}
// Performs an HTTP(S) request for `uri` and follows redirects.
//
// uri:     Absolute http:// or https:// URI.
// archive: If true, a HEAD request is sent and the value of the
//          "Content-Location" response header is returned. If false, a GET
//          request is sent and the response body is returned.
//
// Throws Poco::Exception if the scheme is neither http nor https, if the
// server answers with a status other than 200 or a redirect, if a required
// header is missing, or if too many redirects are chained.
string URI::make_request(const string &uri, bool archive) const
{
    // Upper bound for chained redirects, so that redirect loops terminate.
    constexpr int max_redirects = 10;

    const string method = archive ? HTTPRequest::HTTP_HEAD
                                  : HTTPRequest::HTTP_GET;
    Poco::URI poco_uri(uri);

    for (int redirects = 0; redirects <= max_redirects; ++redirects)
    {
        string path = poco_uri.getPathAndQuery();
        if (path.empty())
        {
            path = "/";
        }

        unique_ptr<HTTPClientSession> session;
        if (poco_uri.getScheme() == "https")
        {
            session = make_unique<HTTPSClientSession>(poco_uri.getHost(),
                                                      poco_uri.getPort());
        }
        else if (poco_uri.getScheme() == "http")
        {
            session = make_unique<HTTPClientSession>(poco_uri.getHost(),
                                                     poco_uri.getPort());
        }
        else
        {
            // NOLINTNEXTLINE(cert-err60-cpp)
            throw Poco::Exception("Protocol not supported.");
        }

        HTTPRequest request(method, path, HTTPMessage::HTTP_1_1);
        request.set("User-Agent", string("remwharead/") + global::version);

        HTTPResponse response;
        session->sendRequest(request);
        istream &rs = session->receiveResponse(response);

        // Not using the constants because some are too new for Debian
        // stretch.
        switch (response.getStatus())
        {
        case 301: // HTTPResponse::HTTP_MOVED_PERMANENTLY
        case 308: // HTTPResponse::HTTP_PERMANENT_REDIRECT
        case 302: // HTTPResponse::HTTP_FOUND
        case 303: // HTTPResponse::HTTP_SEE_OTHER
        case 307: // HTTPResponse::HTTP_TEMPORARY_REDIRECT
        {
            // Resolve the target against the current URI. This handles
            // absolute, host-relative, path-relative and protocol-relative
            // locations and keeps a non-default port.
            poco_uri.resolve(response.get("Location"));
            break; // Leaves the switch; the loop issues the next request.
        }
        case HTTPResponse::HTTP_OK:
        {
            string answer;
            if (archive)
            {
                answer = response.get("Content-Location");
            }
            else
            {
                StreamCopier::copyToString(rs, answer);
            }
            return answer;
        }
        default:
        {
            // NOLINTNEXTLINE(cert-err60-cpp)
            throw Poco::Exception(response.getReason());
        }
        }
    }

    // NOLINTNEXTLINE(cert-err60-cpp)
    throw Poco::Exception("Too many redirects.");
}
// Returns the text of the first <title> element of the downloaded document
// with HTML entities unescaped and newlines replaced, or an empty string if
// the URI is not considered HTML or no title was found.
string URI::extract_title() const
{
    if (!is_html())
    {
        return "";
    }

    vector<string> groups;
    const RegEx title_pattern("<title(?: [^>]+)?>([^<]+)", RegEx::RE_CASELESS);
    title_pattern.split(_document, groups);

    // groups[0] is the whole match, groups[1] the title text.
    if (groups.size() < 2)
    {
        return "";
    }
    return remove_newlines(unescape_html(groups[1]));
}
// Returns the content of the document's description meta tag with HTML
// entities unescaped, newlines replaced and the result shortened via
// cut_text(…, 500). Returns an empty string if the URI is not considered
// HTML or no description was found.
string URI::extract_description() const
{
    if (!is_html())
    {
        return "";
    }

    vector<string> groups;
    const RegEx description_pattern(R"(description"[^>]+content="([^"]+))",
                                    RegEx::RE_CASELESS);
    description_pattern.split(_document, groups);

    // groups[0] is the whole match, groups[1] the description text.
    if (groups.size() < 2)
    {
        return "";
    }

    const string description = remove_newlines(unescape_html(groups[1]));
    return cut_text(description, 500);
}
// Converts the downloaded document to plain text: drops <script> and <style>
// elements including their content, drops all remaining tags, removes
// carriage returns, trims whitespace in front of newlines, collapses runs of
// newlines and finally unescapes HTML entities.
string URI::strip_html() const
{
    const string without_js = remove_html_tags(_document, "script");
    const string without_css = remove_html_tags(without_js, "style");
    string text = remove_html_tags(without_css);

    // Remove CR.
    for (size_t cr = text.find('\r'); cr != std::string::npos;
         cr = text.find('\r', cr))
    {
        text.erase(cr, 1);
    }

    const RegEx trailing_whitespace("\\s+\n");
    const RegEx repeated_newlines("\n{2,}");
    trailing_whitespace.subst(text, "\n", RegEx::RE_GLOBAL);
    repeated_newlines.subst(text, "\n", RegEx::RE_GLOBAL);

    return unescape_html(text);
}
// Removes HTML tags from `html` and returns the result.
//
// html: The markup to process.
// tag:  If empty, every tag (everything from '<' to the next '>') is removed
//       and the text between tags is kept. Otherwise each element starting
//       with "<tag" is removed together with its content, up to and
//       including the '>' that ends the matching "</tag" closing tag.
//
// An unterminated tag at the end of the input is dropped. If an element has
// no closing tag at all, it and everything after it is left untouched.
string URI::remove_html_tags(const string &html, const string &tag) const
{
    // NOTE: I did this with regex_replace before, but libstdc++ segfaulted.
    string out;

    if (tag.empty())
    {
        size_t pos = 0;
        while (pos != std::string::npos)
        {
            const size_t startpos = html.find('<', pos);
            // Keep the text in front of the tag (or the rest of the input if
            // there is no further tag; the count is clamped by append()).
            out.append(html, pos, startpos - pos);
            if (startpos == std::string::npos)
            {
                break;
            }

            const size_t endpos = html.find('>', startpos);
            pos = (endpos == std::string::npos) ? endpos : endpos + 1;
        }
        return out;
    }

    const string opening = "<" + tag;
    const string closing = "</" + tag;
    out = html;

    // The search restarts at the beginning each time on purpose: removing an
    // element can join text into a new opening tag in front of it.
    size_t pos = 0;
    while ((pos = out.find(opening)) != std::string::npos)
    {
        const size_t closepos = out.find(closing, pos);
        if (closepos == std::string::npos)
        {
            break;
        }

        // Closing tags may contain whitespace before '>' ("</script >"), so
        // look for the real end instead of assuming a fixed length.
        const size_t endpos = out.find('>', closepos);
        if (endpos == std::string::npos)
        {
            out.erase(pos); // Unterminated closing tag: drop the rest.
            break;
        }
        out.erase(pos, endpos - pos + 1);
    }

    return out;
}
// Replaces HTML character references in `html` with their UTF-8 encoding and
// returns the result.
//
// Handled are decimal (&#8230;) and hexadecimal (&#x2026;) numeric references
// and the named references listed in the table below. The text is processed
// in a single pass, so the output of one replacement is never unescaped
// again ("&amp;lt;" becomes "&lt;", not "<").
//
// References that cannot be converted are left untouched: unknown names,
// numbers with invalid digits, code points above U+10FFFF and surrogates.
string URI::unescape_html(string html) const
{
    // Source: https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_
    //         entity_references#Character_entity_references_in_HTML
    // Static, because the table never changes and is expensive to build.
    static const array<const std::pair<const string, const char32_t>, 258>
        names =
    {{
        { "exclamation", 0x0021 },
        { "quot", 0x0022 },
        { "percent", 0x0025 },
        { "amp", 0x0026 },
        { "apos", 0x0027 },
        { "add", 0x002B },
        { "lt", 0x003C },
        { "equal", 0x003D },
        { "gt", 0x003E },
        { "nbsp", 0x00A0 },
        { "iexcl", 0x00A1 },
        { "cent", 0x00A2 },
        { "pound", 0x00A3 },
        { "curren", 0x00A4 },
        { "yen", 0x00A5 },
        { "brvbar", 0x00A6 },
        { "sect", 0x00A7 },
        { "uml", 0x00A8 },
        { "copy", 0x00A9 },
        { "ordf", 0x00AA },
        { "laquo", 0x00AB },
        { "not", 0x00AC },
        { "shy", 0x00AD },
        { "reg", 0x00AE },
        { "macr", 0x00AF },
        { "deg", 0x00B0 },
        { "plusmn", 0x00B1 },
        { "sup2", 0x00B2 },
        { "sup3", 0x00B3 },
        { "acute", 0x00B4 },
        { "micro", 0x00B5 },
        { "para", 0x00B6 },
        { "middot", 0x00B7 },
        { "cedil", 0x00B8 },
        { "sup1", 0x00B9 },
        { "ordm", 0x00BA },
        { "raquo", 0x00BB },
        { "frac14", 0x00BC },
        { "frac12", 0x00BD },
        { "frac34", 0x00BE },
        { "iquest", 0x00BF },
        { "Agrave", 0x00C0 },
        { "Aacute", 0x00C1 },
        { "Acirc", 0x00C2 },
        { "Atilde", 0x00C3 },
        { "Auml", 0x00C4 },
        { "Aring", 0x00C5 },
        { "AElig", 0x00C6 },
        { "Ccedil", 0x00C7 },
        { "Egrave", 0x00C8 },
        { "Eacute", 0x00C9 },
        { "Ecirc", 0x00CA },
        { "Euml", 0x00CB },
        { "Igrave", 0x00CC },
        { "Iacute", 0x00CD },
        { "Icirc", 0x00CE },
        { "Iuml", 0x00CF },
        { "ETH", 0x00D0 },
        { "Ntilde", 0x00D1 },
        { "Ograve", 0x00D2 },
        { "Oacute", 0x00D3 },
        { "Ocirc", 0x00D4 },
        { "Otilde", 0x00D5 },
        { "Ouml", 0x00D6 },
        { "times", 0x00D7 },
        { "Oslash", 0x00D8 },
        { "Ugrave", 0x00D9 },
        { "Uacute", 0x00DA },
        { "Ucirc", 0x00DB },
        { "Uuml", 0x00DC },
        { "Yacute", 0x00DD },
        { "THORN", 0x00DE },
        { "szlig", 0x00DF },
        { "agrave", 0x00E0 },
        { "aacute", 0x00E1 },
        { "acirc", 0x00E2 },
        { "atilde", 0x00E3 },
        { "auml", 0x00E4 },
        { "aring", 0x00E5 },
        { "aelig", 0x00E6 },
        { "ccedil", 0x00E7 },
        { "egrave", 0x00E8 },
        { "eacute", 0x00E9 },
        { "ecirc", 0x00EA },
        { "euml", 0x00EB },
        { "igrave", 0x00EC },
        { "iacute", 0x00ED },
        { "icirc", 0x00EE },
        { "iuml", 0x00EF },
        { "eth", 0x00F0 },
        { "ntilde", 0x00F1 },
        { "ograve", 0x00F2 },
        { "oacute", 0x00F3 },
        { "ocirc", 0x00F4 },
        { "otilde", 0x00F5 },
        { "ouml", 0x00F6 },
        { "divide", 0x00F7 },
        { "oslash", 0x00F8 },
        { "ugrave", 0x00F9 },
        { "uacute", 0x00FA },
        { "ucirc", 0x00FB },
        { "uuml", 0x00FC },
        { "yacute", 0x00FD },
        { "thorn", 0x00FE },
        { "yuml", 0x00FF },
        { "OElig", 0x0152 },
        { "oelig", 0x0153 },
        { "Scaron", 0x0160 },
        { "scaron", 0x0161 },
        { "Yuml", 0x0178 },
        { "fnof", 0x0192 },
        { "circ", 0x02C6 },
        { "tilde", 0x02DC },
        { "Alpha", 0x0391 },
        { "Beta", 0x0392 },
        { "Gamma", 0x0393 },
        { "Delta", 0x0394 },
        { "Epsilon", 0x0395 },
        { "Zeta", 0x0396 },
        { "Eta", 0x0397 },
        { "Theta", 0x0398 },
        { "Iota", 0x0399 },
        { "Kappa", 0x039A },
        { "Lambda", 0x039B },
        { "Mu", 0x039C },
        { "Nu", 0x039D },
        { "Xi", 0x039E },
        { "Omicron", 0x039F },
        { "Pi", 0x03A0 },
        { "Rho", 0x03A1 },
        { "Sigma", 0x03A3 },
        { "Tau", 0x03A4 },
        { "Upsilon", 0x03A5 },
        { "Phi", 0x03A6 },
        { "Chi", 0x03A7 },
        { "Psi", 0x03A8 },
        { "Omega", 0x03A9 },
        { "alpha", 0x03B1 },
        { "beta", 0x03B2 },
        { "gamma", 0x03B3 },
        { "delta", 0x03B4 },
        { "epsilon", 0x03B5 },
        { "zeta", 0x03B6 },
        { "eta", 0x03B7 },
        { "theta", 0x03B8 },
        { "iota", 0x03B9 },
        { "kappa", 0x03BA },
        { "lambda", 0x03BB },
        { "mu", 0x03BC },
        { "nu", 0x03BD },
        { "xi", 0x03BE },
        { "omicron", 0x03BF },
        { "pi", 0x03C0 },
        { "rho", 0x03C1 },
        { "sigmaf", 0x03C2 },
        { "sigma", 0x03C3 },
        { "tau", 0x03C4 },
        { "upsilon", 0x03C5 },
        { "phi", 0x03C6 },
        { "chi", 0x03C7 },
        { "psi", 0x03C8 },
        { "omega", 0x03C9 },
        { "thetasym", 0x03D1 },
        { "upsih", 0x03D2 },
        { "piv", 0x03D6 },
        { "ensp", 0x2002 },
        { "emsp", 0x2003 },
        { "thinsp", 0x2009 },
        { "zwnj", 0x200C },
        { "zwj", 0x200D },
        { "lrm", 0x200E },
        { "rlm", 0x200F },
        { "ndash", 0x2013 },
        { "mdash", 0x2014 },
        { "horbar", 0x2015 },
        { "lsquo", 0x2018 },
        { "rsquo", 0x2019 },
        { "sbquo", 0x201A },
        { "ldquo", 0x201C },
        { "rdquo", 0x201D },
        { "bdquo", 0x201E },
        { "dagger", 0x2020 },
        { "Dagger", 0x2021 },
        { "bull", 0x2022 },
        { "hellip", 0x2026 },
        { "permil", 0x2030 },
        { "prime", 0x2032 },
        { "Prime", 0x2033 },
        { "lsaquo", 0x2039 },
        { "rsaquo", 0x203A },
        { "oline", 0x203E },
        { "frasl", 0x2044 },
        { "euro", 0x20AC },
        { "image", 0x2111 },
        { "weierp", 0x2118 },
        { "real", 0x211C },
        { "trade", 0x2122 },
        { "alefsym", 0x2135 },
        { "larr", 0x2190 },
        { "uarr", 0x2191 },
        { "rarr", 0x2192 },
        { "darr", 0x2193 },
        { "harr", 0x2194 },
        { "crarr", 0x21B5 },
        { "lArr", 0x21D0 },
        { "uArr", 0x21D1 },
        { "rArr", 0x21D2 },
        { "dArr", 0x21D3 },
        { "hArr", 0x21D4 },
        { "forall", 0x2200 },
        { "part", 0x2202 },
        { "exist", 0x2203 },
        { "empty", 0x2205 },
        { "nabla", 0x2207 },
        { "isin", 0x2208 },
        { "notin", 0x2209 },
        { "ni", 0x220B },
        { "prod", 0x220F },
        { "sum", 0x2211 },
        { "minus", 0x2212 },
        { "lowast", 0x2217 },
        { "radic", 0x221A },
        { "prop", 0x221D },
        { "infin", 0x221E },
        { "ang", 0x2220 },
        { "and", 0x2227 },
        { "or", 0x2228 },
        { "cap", 0x2229 },
        { "cup", 0x222A },
        { "int", 0x222B },
        { "there4", 0x2234 },
        { "sim", 0x223C },
        { "cong", 0x2245 },
        { "asymp", 0x2248 },
        { "ne", 0x2260 },
        { "equiv", 0x2261 },
        { "le", 0x2264 },
        { "ge", 0x2265 },
        { "sub", 0x2282 },
        { "sup", 0x2283 },
        { "nsub", 0x2284 },
        { "sube", 0x2286 },
        { "supe", 0x2287 },
        { "oplus", 0x2295 },
        { "otimes", 0x2297 },
        { "perp", 0x22A5 },
        { "sdot", 0x22C5 },
        { "lceil", 0x2308 },
        { "rceil", 0x2309 },
        { "lfloor", 0x230A },
        { "rfloor", 0x230B },
        { "lang", 0x2329 },
        { "rang", 0x232A },
        { "loz", 0x25CA },
        { "spades", 0x2660 },
        { "clubs", 0x2663 },
        { "hearts", 0x2665 },
        { "diams", 0x2666 }
    }};

    // Used to convert int to utf-8 char.
    std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> u8c;

    // Group 1: 'x' marker of a hexadecimal reference (optional).
    // Group 2: digits of a numeric reference.
    // Group 3: name of a named reference.
    const RegEx re_entity("&(?:#(x)?([[:alnum:]]{1,8})|([[:alnum:]]+));");
    RegEx::MatchVec matches;
    string::size_type pos = 0;

    while (re_entity.match(html, pos, matches) != 0)
    {
        const string::size_type start = matches[0].offset;
        const string::size_type length = matches[0].length;
        char32_t codepoint = 0;
        bool valid = false;

        // Trailing groups that did not take part in the match are missing
        // from `matches`, earlier ones have an offset of npos.
        if (matches.size() > 3 && matches[3].offset != string::npos)
        {
            const string name = html.substr(matches[3].offset,
                                            matches[3].length);
            for (const auto &entry : names)
            {
                if (entry.first == name)
                {
                    codepoint = entry.second;
                    valid = true;
                    break;
                }
            }
        }
        else if (matches.size() > 2 && matches[2].offset != string::npos)
        {
            const string number = html.substr(matches[2].offset,
                                              matches[2].length);
            // 'x' in front of the number means it's hexadecimal, else
            // decimal.
            const bool is_hex = (matches[1].offset != string::npos
                                 && matches[1].length != 0);
            try
            {
                size_t parsed = 0;
                // NOLINTNEXTLINE(google-runtime-int) - Same type as stoul.
                const unsigned long value =
                    std::stoul(number, &parsed, is_hex ? 16 : 10);
                // Require that all characters were digits and that the value
                // is a Unicode scalar value (no surrogate, <= U+10FFFF).
                valid = (parsed == number.length() && value <= 0x10FFFF
                         && (value < 0xD800 || value > 0xDFFF));
                if (valid)
                {
                    codepoint = static_cast<char32_t>(value);
                }
            }
            catch (const std::logic_error &)
            {
                // Not a number at all; leave the reference as it is.
            }
        }

        if (!valid)
        {
            pos = start + length; // Skip the reference unchanged.
            continue;
        }

        const string unicode = u8c.to_bytes(codepoint);
        html.replace(start, length, unicode);
        // Continue behind the inserted character, never inside it.
        pos = start + unicode.length();
    }

    return html;
}
// Asks the Wayback Machine (web.archive.org) to save the URI.
//
// Returns an archive_answer: on success the first field is true and the
// third holds the archive URI; otherwise the first field is false and the
// second holds the error message. Only URIs starting with "http" are
// submitted.
archive_answer URI::archive() const
{
    const bool starts_with_http = (_uri.compare(0, 4, "http") == 0);
    if (!starts_with_http)
    {
        return { false, "Only HTTP(S) is archivable.", "" };
    }

    string location;
    try
    {
        // With archive = true, make_request() returns the value of the
        // Content-Location header instead of the body.
        location = make_request("https://web.archive.org/save/" + _uri, true);
    }
    catch (const Poco::Exception &e)
    {
        return { false, e.displayText(), "" };
    }

    if (location.empty())
    {
        return { false, "Unknown error.", "" };
    }
    return { true, "", "https://web.archive.org" + location };
}
// Returns `text` with every newline replaced by a space. A carriage return
// directly in front of a newline is replaced by a space as well, so "\r\n"
// becomes two spaces and the length of the text does not change.
string URI::remove_newlines(string text) const
{
    size_t posn = 0;
    while ((posn = text.find('\n', posn)) != std::string::npos)
    {
        text[posn] = ' ';
        // Only look at the preceding character if there is one; a newline at
        // position 0 has no predecessor.
        if (posn > 0 && text[posn - 1] == '\r')
        {
            text[posn - 1] = ' ';
        }
        ++posn;
    }
    return text;
}
// Shortens `text` to at most `n_chars` bytes, including the appended
// " […]" marker. Texts that already fit are returned unchanged.
//
// The cut is made at the last space that leaves room for the marker. If
// there is no such space, the text is cut hard at the limit, moved back to
// the start of a UTF-8 sequence so that no multi-byte character is split.
//
// Throws std::invalid_argument if the text needs shortening and `n_chars`
// is not greater than the length of the marker.
string URI::cut_text(const string &text, const uint16_t n_chars) const
{
    if (text.size() <= n_chars)
    {
        return text;
    }

    constexpr char suffix[] = " […]";
    // Length in bytes, without the terminating NUL.
    constexpr auto suffix_len = std::end(suffix) - std::begin(suffix) - 1;
    if (n_chars <= suffix_len)
    {
        throw std::invalid_argument("n_chars has to be greater than "
                                    + std::to_string(suffix_len));
    }

    // limit < text.size() holds here, so text[limit] is a valid index.
    const auto limit = static_cast<size_t>(n_chars - suffix_len);
    size_t pos = text.rfind(' ', limit);
    if (pos == std::string::npos)
    {
        // No space to cut at. Without this fallback, substr(0, npos) would
        // return the whole text and the result would exceed n_chars.
        pos = limit;
        // UTF-8 continuation bytes have the form 10xxxxxx.
        while (pos > 0
               && (static_cast<unsigned char>(text[pos]) & 0xC0U) == 0x80U)
        {
            --pos;
        }
    }

    return text.substr(0, pos) + suffix;
}
// Converts `str` from the document's encoding to UTF-8. The encoding is
// detected via detect_encoding() as long as none is known. Text whose
// encoding is "utf-8" is returned unchanged.
//
// NOTE(review): if detection finds nothing, _encoding stays empty and is
// passed to Boost.Locale as it is - confirm how the conversion backend
// treats an empty charset name.
string URI::to_utf8(const string &str)
{
    const bool encoding_unknown = _encoding.empty();
    if (encoding_unknown)
    {
        detect_encoding();
    }

    if (_encoding != "utf-8")
    {
        return boost::locale::conv::to_utf<char>(str, _encoding);
    }
    return str;
}
// Looks for a charset declaration inside a <meta> tag of the downloaded
// document and stores it, lowercased, in _encoding. _encoding is left
// unchanged if no declaration is found.
//
// Recognized forms (case-insensitive):
//   <meta charset="utf-8">, <meta charset='utf-8'>, <meta charset=utf-8>
//   <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
void URI::detect_encoding()
{
    // [^>]+ keeps the search inside the <meta> tag; the charset value ends
    // at a quote, whitespace, ';', '/' or '>'.
    const RegEx re_encoding(
        R"re(<meta[^>]+charset\s*=\s*["']?([^"'\s;/>]+))re",
        RegEx::RE_CASELESS);
    vector<string> matches;
    re_encoding.split(_document, matches);
    // matches[0] is the whole match, matches[1] the charset name.
    if (matches.size() >= 2)
    {
        _encoding = boost::locale::to_lower(matches[1]);
    }
}
bool URI::is_html() const
{
const RegEx re_htmlfile(".*\\.(.?html?|xml|rss)$", RegEx::RE_CASELESS);
return (_uri.substr(0, 4) == "http" || re_htmlfile.match(_uri));
}
} // namespace remwharead