Download, archive and process a URI.
More...
#include <remwharead/uri.hpp>
Download, archive and process a URI.
- Since
- 0.6.0
◆ URI()
remwharead::URI::URI |
( |
const string & |
uri | ) |
|
|
explicit |
Construct object and set URL.
Initializes TLS and sets the proxy from the environment variable http_proxy, if possible.
- Since
- 0.6.0
67 Poco::Net::initializeSSL();
71 HTTPClientSession::ProxyConfig proxy;
72 string proxy_env = Environment::get(
"http_proxy");
76 if ((pos = proxy_env.find(
"//")) != string::npos)
78 proxy_env = proxy_env.substr(pos + 2);
80 if ((pos = proxy_env.find(
'/')) != string::npos)
82 proxy_env = proxy_env.substr(0, pos);
85 if ((pos = proxy_env.find(
':')) != string::npos)
87 proxy.host = proxy_env.substr(0, pos);
88 proxy.port = std::stoi(proxy_env.substr(pos + 1));
92 proxy.host = proxy_env;
95 HTTPClientSession::setGlobalProxyConfig(proxy);
97 catch (
const std::exception &)
◆ archive()
Save URI in archive and return archive-URI.
- Since
- 0.6.0
593 if (_uri.substr(0, 4) !=
"http")
595 return {
false,
"Only HTTP(S) is archivable.",
"" };
600 const string answer =
make_request(
"https://web.archive.org/save/"
605 return {
true,
"",
"https://web.archive.org" + answer };
608 catch (
const Poco::Exception &e)
610 return {
false, e.displayText(),
"" };
613 return {
false,
"Unknown error.",
"" };
const string make_request(const string &uri, bool archive=false) const
Make an HTTP(S) request.
Definition: uri.cpp:133
◆ extract_description()
const string remwharead::URI::extract_description |
( |
const string & |
html | ) |
|
|
protected |
Extract the description from an HTML page.
- Since
- 0.6.0
221 const regex re_htmlfile(
"\\.(.?html?|xml|rss)$");
222 if (_uri.substr(0, 4) ==
"http" || regex_search(_uri, re_htmlfile))
225 const regex re(
"description\"[^>]+content=\"([^\"]+)", icase);
226 regex_search(html, match, re);
const string strip_html(const string &html)
Removes HTML tags and superfluous spaces from an HTML page.
Definition: uri.cpp:233
const string remove_newlines(string text)
Replace newlines with spaces.
Definition: uri.cpp:616
◆ extract_title()
const string remwharead::URI::extract_title |
( |
const string & |
html | ) |
|
|
protected |
Extract the title from an HTML page.
- Since
- 0.6.0
208 const regex re_htmlfile(
"\\.(.?html?|xml|rss)$");
209 if (_uri.substr(0, 4) ==
"http" || regex_search(_uri, re_htmlfile))
212 regex_search(html, match, regex(
"<title>([^<]+)", icase));
const string remove_newlines(string text)
Replace newlines with spaces.
Definition: uri.cpp:616
const string unescape_html(const string &html)
Convert HTML entities to UTF-8.
Definition: uri.cpp:291
◆ get()
Download URI and extract title, description and full text.
- Since
- 0.6.0
125 catch (
const Poco::Exception &e)
127 return {
false, e.displayText(),
"",
"",
"" };
130 return {
false,
"Unknown error.",
"",
"",
"" };
const string strip_html(const string &html)
Removes HTML tags and superfluous spaces from an HTML page.
Definition: uri.cpp:233
const string extract_title(const string &html)
Extract the title from an HTML page.
Definition: uri.cpp:206
const string make_request(const string &uri, bool archive=false) const
Make an HTTP(S) request.
Definition: uri.cpp:133
const string extract_description(const string &html)
Extract the description from an HTML page.
Definition: uri.cpp:219
◆ make_request()
const string remwharead::URI::make_request |
( |
const string & |
uri, |
|
|
bool |
archive = false |
|
) |
| const |
|
protected |
Make an HTTP(S) request.
- Since
- 0.6.0
135 Poco::URI poco_uri(uri);
137 archive ? HTTPRequest::HTTP_HEAD : HTTPRequest::HTTP_GET;
138 string path = poco_uri.getPathAndQuery();
144 unique_ptr<HTTPClientSession> session;
145 if (poco_uri.getScheme() ==
"https")
147 session = make_unique<HTTPSClientSession>(poco_uri.getHost(),
150 else if (poco_uri.getScheme() ==
"http")
152 session = make_unique<HTTPClientSession>(poco_uri.getHost(),
157 throw Poco::Exception(
"Protocol not supported.");
160 HTTPRequest request(method, path, HTTPMessage::HTTP_1_1);
161 request.set(
"User-Agent",
string(
"remwharead/") + global::version);
163 HTTPResponse response;
165 session->sendRequest(request);
166 istream &rs = session->receiveResponse(response);
169 switch (response.getStatus())
177 string location = response.get(
"Location");
178 if (location.substr(0, 4) !=
"http")
180 location = poco_uri.getScheme() +
"://" + poco_uri.getHost()
185 case HTTPResponse::HTTP_OK:
190 answer = response.get(
"Content-Location");
194 StreamCopier::copyToString(rs, answer);
200 throw Poco::Exception(response.getReason());
const string make_request(const string &uri, bool archive=false) const
Make an HTTP(S) request.
Definition: uri.cpp:133
const archive_answer archive()
Save URI in archive and return archive-URI.
Definition: uri.cpp:591
◆ remove_html_tags()
const string remwharead::URI::remove_html_tags |
( |
const string & |
html, |
|
|
const string & |
tag = "" |
|
) |
| |
|
protected |
Remove HTML tags.
- Parameters
-
html | HTML page. |
tag | If set, only remove this tag. |
- Since
- 0.6.0
260 while (pos != std::string::npos)
262 size_t startpos = html.find(
'<', pos);
263 size_t endpos = html.find(
'>', startpos);
264 out += html.substr(pos, startpos - pos);
266 if (pos != std::string::npos)
276 while ((pos = out.find(
"<" + tag)) != std::string::npos)
278 size_t endpos = out.find(
"</" + tag, pos);
279 if (endpos == std::string::npos)
283 endpos += 3 + tag.length();
284 out.replace(pos, endpos - pos,
"");
◆ remove_newlines()
const string remwharead::URI::remove_newlines |
( |
string |
text | ) |
|
|
protected |
Replace newlines with spaces.
- Since
- 0.6.0
619 while ((posn = text.find(
'\n', posn)) != std::string::npos)
621 text.replace(posn, 1,
" ");
623 size_t posr = posn - 1;
624 if (text[posr] ==
'\r')
626 text.replace(posr, 1,
" ");
◆ strip_html()
const string remwharead::URI::strip_html |
( |
const string & |
html | ) |
|
|
protected |
Removes HTML tags and superfluous spaces from an HTML page.
- Since
- 0.6.0
242 while ((pos = out.find(
"\r", pos)) != std::string::npos)
244 out.replace(pos, 1,
"");
247 out = regex_replace(out, regex(
"\\s+\n"),
"\n");
248 out = regex_replace(out, regex(
"\n{2,}"),
"\n");
const string remove_html_tags(const string &html, const string &tag="")
Remove HTML tags.
Definition: uri.cpp:253
const string unescape_html(const string &html)
Convert HTML entities to UTF-8.
Definition: uri.cpp:291
◆ unescape_html()
const string remwharead::URI::unescape_html |
( |
const string & |
html | ) |
|
|
protected |
Convert HTML entities to UTF-8.
- Since
- 0.6.0
293 string buffer = html;
297 std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> u8c;
298 regex re_entity(
"&#(x)?([[:alnum:]]{1,8});");
301 while (regex_search(buffer, match, re_entity))
303 char32_t codepoint = 0;
305 if (match[1].length() == 1)
307 codepoint = std::stoi(match[2].str(),
nullptr, 16);
311 codepoint = std::stoi(match[2].str(),
nullptr, 10);
313 output += match.prefix().str() + u8c.to_bytes(codepoint);
314 buffer = match.suffix().str();
320 const array<const std::pair<const string, const char32_t>, 258> names =
322 {
"exclamation", 0x0021 },
324 {
"percent", 0x0025 },
335 {
"curren", 0x00A4 },
337 {
"brvbar", 0x00A6 },
348 {
"plusmn", 0x00B1 },
354 {
"middot", 0x00B7 },
359 {
"frac14", 0x00BC },
360 {
"frac12", 0x00BD },
361 {
"frac34", 0x00BE },
362 {
"iquest", 0x00BF },
363 {
"Agrave", 0x00C0 },
364 {
"Aacute", 0x00C1 },
366 {
"Atilde", 0x00C3 },
370 {
"Ccedil", 0x00C7 },
371 {
"Egrave", 0x00C8 },
372 {
"Eacute", 0x00C9 },
375 {
"Igrave", 0x00CC },
376 {
"Iacute", 0x00CD },
380 {
"Ntilde", 0x00D1 },
381 {
"Ograve", 0x00D2 },
382 {
"Oacute", 0x00D3 },
384 {
"Otilde", 0x00D5 },
387 {
"Oslash", 0x00D8 },
388 {
"Ugrave", 0x00D9 },
389 {
"Uacute", 0x00DA },
392 {
"Yacute", 0x00DD },
395 {
"agrave", 0x00E0 },
396 {
"aacute", 0x00E1 },
398 {
"atilde", 0x00E3 },
402 {
"ccedil", 0x00E7 },
403 {
"egrave", 0x00E8 },
404 {
"eacute", 0x00E9 },
407 {
"igrave", 0x00EC },
408 {
"iacute", 0x00ED },
412 {
"ntilde", 0x00F1 },
413 {
"ograve", 0x00F2 },
414 {
"oacute", 0x00F3 },
416 {
"otilde", 0x00F5 },
418 {
"divide", 0x00F7 },
419 {
"oslash", 0x00F8 },
420 {
"ugrave", 0x00F9 },
421 {
"uacute", 0x00FA },
424 {
"yacute", 0x00FD },
429 {
"Scaron", 0x0160 },
430 {
"scaron", 0x0161 },
439 {
"Epsilon", 0x0395 },
445 {
"Lambda", 0x039B },
449 {
"Omicron", 0x039F },
454 {
"Upsilon", 0x03A5 },
463 {
"epsilon", 0x03B5 },
469 {
"lambda", 0x03BB },
473 {
"omicron", 0x03BF },
476 {
"sigmaf", 0x03C2 },
479 {
"upsilon", 0x03C5 },
484 {
"thetasym", 0x03D1 },
489 {
"thinsp", 0x2009 },
496 {
"horbar", 0x2015 },
503 {
"dagger", 0x2020 },
504 {
"Dagger", 0x2021 },
506 {
"hellip", 0x2026 },
507 {
"permil", 0x2030 },
510 {
"lsaquo", 0x2039 },
511 {
"rsaquo", 0x203A },
516 {
"weierp", 0x2118 },
519 {
"alefsym", 0x2135 },
531 {
"forall", 0x2200 },
542 {
"lowast", 0x2217 },
552 {
"there4", 0x2234 },
566 {
"otimes", 0x2297 },
571 {
"lfloor", 0x230A },
572 {
"rfloor", 0x230B },
576 {
"spades", 0x2660 },
578 {
"hearts", 0x2665 },
582 for (
auto &pair : names)
584 const regex re(
'&' + pair.first +
';');
585 output = regex_replace(output, re, u8c.to_bytes(pair.second));
The documentation for this class was generated from the following files: