Download, archive and process a URI.
More...
#include <remwharead/uri.hpp>
Download, archive and process a URI.
- Since
- 0.6.0
◆ URI()
remwharead::URI::URI |
( |
const string & |
uri | ) |
|
|
explicit |
Construct object and set URL.
Initializes TLS and sets proxy from the environment variable http_proxy, if possible.
- Since
- 0.6.0
67 Poco::Net::initializeSSL();
71 HTTPClientSession::ProxyConfig proxy;
72 const string env_proxy = Environment::get(
"http_proxy");
73 const RegEx re_proxy(
"^(?:https?://)?(?:([^:]+):?([^@]*)@)?"
74 "([^:/]+)(?::([\\d]{1,5}))?/?$");
75 vector<string> matches;
77 if (re_proxy.split(env_proxy, matches) >= 4)
79 proxy.username = matches[1];
80 proxy.password = matches[2];
81 proxy.host = matches[3];
82 if (!matches[4].empty())
84 const std::uint32_t &port = std::stoul(matches[4]);
87 throw std::invalid_argument(
"Port number out of range");
92 HTTPClientSession::setGlobalProxyConfig(proxy);
94 catch (
const Poco::RegularExpressionException &e)
96 cerr <<
"Error: Proxy could not be set ("
97 << e.displayText() <<
")\n";
99 catch (
const std::invalid_argument &e)
101 cerr <<
"Error: " << e.what() << endl;
103 catch (
const std::exception &)
◆ archive()
Save URI in archive and return archive-URI.
- Since
- 0.6.0
608 if (_uri.substr(0, 4) !=
"http")
610 return {
false,
"Only HTTP(S) is archivable.",
"" };
615 const string answer =
make_request(
"https://web.archive.org/save/"
620 return {
true,
"",
"https://web.archive.org" + answer };
623 catch (
const Poco::Exception &e)
625 return {
false, e.displayText(),
"" };
628 return {
false,
"Unknown error.",
"" };
const string make_request(const string &uri, bool archive=false) const
Make an HTTP(S) request.
Definition: uri.cpp:139
◆ extract_description()
const string remwharead::URI::extract_description |
( |
const string & |
html | ) |
|
|
protected |
Extract the description from an HTML page.
- Since
- 0.6.0
231 const RegEx re_htmlfile(
".*\\.(.?html?|xml|rss)$", RegEx::RE_CASELESS);
232 if (_uri.substr(0, 4) ==
"http" || re_htmlfile.match(_uri))
234 const RegEx re_desc(
"description\"[^>]+content=\"([^\"]+)",
236 vector<string> matches;
237 re_desc.split(html, matches);
238 if (matches.size() >= 2)
const string unescape_html(string html)
Convert HTML entities to UTF-8.
Definition: uri.cpp:306
const string remove_newlines(string text)
Replace newlines with spaces.
Definition: uri.cpp:631
◆ extract_title()
const string remwharead::URI::extract_title |
( |
const string & |
html | ) |
|
|
protected |
Extract the title from an HTML page.
- Since
- 0.6.0
214 const RegEx re_htmlfile(
".*\\.(.?html?|xml|rss)$", RegEx::RE_CASELESS);
215 if (_uri.substr(0, 4) ==
"http" || re_htmlfile.match(_uri))
217 const RegEx re_title(
"<title>([^<]+)", RegEx::RE_CASELESS);
218 vector<string> matches;
219 re_title.split(html, matches);
220 if (matches.size() >= 2)
const string unescape_html(string html)
Convert HTML entities to UTF-8.
Definition: uri.cpp:306
const string remove_newlines(string text)
Replace newlines with spaces.
Definition: uri.cpp:631
◆ get()
Download URI and extract title, description and full text.
- Since
- 0.6.0
131 catch (
const Poco::Exception &e)
133 return {
false, e.displayText(),
"",
"",
"" };
136 return {
false,
"Unknown error.",
"",
"",
"" };
const string strip_html(const string &html)
Removes HTML tags and superfluous spaces from an HTML page.
Definition: uri.cpp:247
const string extract_title(const string &html)
Extract the title from an HTML page.
Definition: uri.cpp:212
const string make_request(const string &uri, bool archive=false) const
Make an HTTP(S) request.
Definition: uri.cpp:139
const string extract_description(const string &html)
Extract the description from an HTML page.
Definition: uri.cpp:229
◆ make_request()
const string remwharead::URI::make_request |
( |
const string & |
uri, |
|
|
bool |
archive = false |
|
) |
| const |
|
protected |
Make an HTTP(S) request.
- Since
- 0.6.0
141 Poco::URI poco_uri(uri);
143 archive ? HTTPRequest::HTTP_HEAD : HTTPRequest::HTTP_GET;
144 string path = poco_uri.getPathAndQuery();
150 unique_ptr<HTTPClientSession> session;
151 if (poco_uri.getScheme() ==
"https")
153 session = make_unique<HTTPSClientSession>(poco_uri.getHost(),
156 else if (poco_uri.getScheme() ==
"http")
158 session = make_unique<HTTPClientSession>(poco_uri.getHost(),
163 throw Poco::Exception(
"Protocol not supported.");
166 HTTPRequest request(method, path, HTTPMessage::HTTP_1_1);
167 request.set(
"User-Agent",
string(
"remwharead/") + global::version);
169 HTTPResponse response;
171 session->sendRequest(request);
172 istream &rs = session->receiveResponse(response);
175 switch (response.getStatus())
183 string location = response.get(
"Location");
184 if (location.substr(0, 4) !=
"http")
186 location = poco_uri.getScheme() +
"://" + poco_uri.getHost()
191 case HTTPResponse::HTTP_OK:
196 answer = response.get(
"Content-Location");
200 StreamCopier::copyToString(rs, answer);
206 throw Poco::Exception(response.getReason());
const string make_request(const string &uri, bool archive=false) const
Make an HTTP(S) request.
Definition: uri.cpp:139
const archive_answer archive()
Save URI in archive and return archive-URI.
Definition: uri.cpp:606
◆ remove_html_tags()
const string remwharead::URI::remove_html_tags |
( |
const string & |
html, |
|
|
const string & |
tag = "" |
|
) |
| |
|
protected |
Remove HTML tags.
- Parameters
-
html | HTML page. |
tag | If set, only remove this tag. |
- Since
- 0.6.0
275 while (pos != std::string::npos)
277 size_t startpos = html.find(
'<', pos);
278 size_t endpos = html.find(
'>', startpos);
279 out += html.substr(pos, startpos - pos);
281 if (pos != std::string::npos)
291 while ((pos = out.find(
"<" + tag)) != std::string::npos)
293 size_t endpos = out.find(
"</" + tag, pos);
294 if (endpos == std::string::npos)
298 endpos += 3 + tag.length();
299 out.replace(pos, endpos - pos,
"");
◆ remove_newlines()
const string remwharead::URI::remove_newlines |
( |
string |
text | ) |
|
|
protected |
Replace newlines with spaces.
- Since
- 0.6.0
634 while ((posn = text.find(
'\n', posn)) != std::string::npos)
636 text.replace(posn, 1,
" ");
638 size_t posr = posn - 1;
639 if (text[posr] ==
'\r')
641 text.replace(posr, 1,
" ");
◆ strip_html()
const string remwharead::URI::strip_html |
( |
const string & |
html | ) |
|
|
protected |
Removes HTML tags and superfluous spaces from an HTML page.
- Since
- 0.6.0
256 while ((pos = out.find(
"\r", pos)) != std::string::npos)
258 out.replace(pos, 1,
"");
262 RegEx(
"\\s+\n").subst(out,
"\n", RegEx::RE_GLOBAL);
263 RegEx(
"\n{2,}").subst(out,
"\n", RegEx::RE_GLOBAL);
const string unescape_html(string html)
Convert HTML entities to UTF-8.
Definition: uri.cpp:306
const string remove_html_tags(const string &html, const string &tag="")
Remove HTML tags.
Definition: uri.cpp:268
◆ unescape_html()
const string remwharead::URI::unescape_html |
( |
string |
html | ) |
|
|
protected |
Convert HTML entities to UTF-8.
- Since
- 0.6.0
309 std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> u8c;
310 const RegEx re_entity(
"&#(x)?([[:alnum:]]{1,8});");
311 RegEx::MatchVec matches;
312 string::size_type pos = 0;
314 while (re_entity.match(html, pos, matches) != 0)
316 char32_t codepoint = 0;
317 const string number = html.substr(matches[2].offset,
320 if (matches[1].length != 0)
322 codepoint = std::stoi(number,
nullptr, 16);
326 codepoint = std::stoi(number,
nullptr, 10);
328 const string unicode = u8c.to_bytes(codepoint);
329 html.replace(matches[0].offset, matches[0].length, unicode);
330 pos = matches[0].offset + unicode.length();
335 const array<const std::pair<const string, const char32_t>, 258> names =
337 {
"exclamation", 0x0021 },
339 {
"percent", 0x0025 },
350 {
"curren", 0x00A4 },
352 {
"brvbar", 0x00A6 },
363 {
"plusmn", 0x00B1 },
369 {
"middot", 0x00B7 },
374 {
"frac14", 0x00BC },
375 {
"frac12", 0x00BD },
376 {
"frac34", 0x00BE },
377 {
"iquest", 0x00BF },
378 {
"Agrave", 0x00C0 },
379 {
"Aacute", 0x00C1 },
381 {
"Atilde", 0x00C3 },
385 {
"Ccedil", 0x00C7 },
386 {
"Egrave", 0x00C8 },
387 {
"Eacute", 0x00C9 },
390 {
"Igrave", 0x00CC },
391 {
"Iacute", 0x00CD },
395 {
"Ntilde", 0x00D1 },
396 {
"Ograve", 0x00D2 },
397 {
"Oacute", 0x00D3 },
399 {
"Otilde", 0x00D5 },
402 {
"Oslash", 0x00D8 },
403 {
"Ugrave", 0x00D9 },
404 {
"Uacute", 0x00DA },
407 {
"Yacute", 0x00DD },
410 {
"agrave", 0x00E0 },
411 {
"aacute", 0x00E1 },
413 {
"atilde", 0x00E3 },
417 {
"ccedil", 0x00E7 },
418 {
"egrave", 0x00E8 },
419 {
"eacute", 0x00E9 },
422 {
"igrave", 0x00EC },
423 {
"iacute", 0x00ED },
427 {
"ntilde", 0x00F1 },
428 {
"ograve", 0x00F2 },
429 {
"oacute", 0x00F3 },
431 {
"otilde", 0x00F5 },
433 {
"divide", 0x00F7 },
434 {
"oslash", 0x00F8 },
435 {
"ugrave", 0x00F9 },
436 {
"uacute", 0x00FA },
439 {
"yacute", 0x00FD },
444 {
"Scaron", 0x0160 },
445 {
"scaron", 0x0161 },
454 {
"Epsilon", 0x0395 },
460 {
"Lambda", 0x039B },
464 {
"Omicron", 0x039F },
469 {
"Upsilon", 0x03A5 },
478 {
"epsilon", 0x03B5 },
484 {
"lambda", 0x03BB },
488 {
"omicron", 0x03BF },
491 {
"sigmaf", 0x03C2 },
494 {
"upsilon", 0x03C5 },
499 {
"thetasym", 0x03D1 },
504 {
"thinsp", 0x2009 },
511 {
"horbar", 0x2015 },
518 {
"dagger", 0x2020 },
519 {
"Dagger", 0x2021 },
521 {
"hellip", 0x2026 },
522 {
"permil", 0x2030 },
525 {
"lsaquo", 0x2039 },
526 {
"rsaquo", 0x203A },
531 {
"weierp", 0x2118 },
534 {
"alefsym", 0x2135 },
546 {
"forall", 0x2200 },
557 {
"lowast", 0x2217 },
567 {
"there4", 0x2234 },
581 {
"otimes", 0x2297 },
586 {
"lfloor", 0x230A },
587 {
"rfloor", 0x230B },
591 {
"spades", 0x2660 },
593 {
"hearts", 0x2665 },
597 for (
auto &pair : names)
599 const RegEx re(
'&' + pair.first +
';');
600 re.subst(html, u8c.to_bytes(pair.second), RegEx::RE_GLOBAL);
The documentation for this class was generated from the following files: