Download, archive and process a URI.
More...
#include <remwharead/uri.hpp>
Download, archive and process a URI.
- Since
- 0.6.0
◆ URI()
remwharead::URI::URI |
( |
const string & |
uri | ) |
|
|
explicit |
Construct object and set URL.
Initializes TLS and sets the proxy from the environment variable http_proxy, if possible.
- Since
- 0.6.0
67 Poco::Net::initializeSSL();
71 HTTPClientSession::ProxyConfig proxy;
72 string proxy_env = Environment::get(
"http_proxy");
76 if ((pos = proxy_env.find(
"//")) != string::npos)
78 proxy_env = proxy_env.substr(pos + 2);
80 if ((pos = proxy_env.find(
'/')) != string::npos)
82 proxy_env = proxy_env.substr(0, pos);
85 if ((pos = proxy_env.find(
':')) != string::npos)
87 proxy.host = proxy_env.substr(0, pos);
88 proxy.port = std::stoi(proxy_env.substr(pos + 1));
92 proxy.host = proxy_env;
95 HTTPClientSession::setGlobalProxyConfig(proxy);
97 catch (
const std::exception &)
◆ archive()
Save URI in archive and return archive-URI.
- Since
- 0.6.0
589 if (_uri.substr(0, 4) !=
"http")
591 return {
false,
"Only HTTP(S) is archivable.",
"" };
596 const string answer =
make_request(
"https://web.archive.org/save/" 601 return {
true,
"",
"https://web.archive.org" + answer };
604 catch (
const Poco::Exception &e)
606 return {
false, e.displayText(),
"" };
609 return {
false,
"Unknown error.",
"" };
const string make_request(const string &uri, bool archive=false) const
Make an HTTP(S) request.
Definition: uri.cpp:133
◆ extract_description()
const string remwharead::URI::extract_description |
( |
const string & |
html | ) |
|
|
protected |
Extract the description from an HTML page.
- Since
- 0.6.0
217 const regex re_htmlfile(
"\\.(.?html?|xml|rss)$");
218 if (_uri.substr(0, 4) ==
"http" || regex_search(_uri, re_htmlfile))
221 const regex re(
"description\"[^>]+content=\"([^\"]+)", icase);
222 regex_search(html, match, re);
const string strip_html(const string &html)
Removes HTML tags and superfluous spaces from an HTML page.
Definition: uri.cpp:229
const string remove_newlines(string text)
Replace newlines with spaces.
Definition: uri.cpp:612
◆ extract_title()
const string remwharead::URI::extract_title |
( |
const string & |
html | ) |
|
|
protected |
Extract the title from an HTML page.
- Since
- 0.6.0
204 const regex re_htmlfile(
"\\.(.?html?|xml|rss)$");
205 if (_uri.substr(0, 4) ==
"http" || regex_search(_uri, re_htmlfile))
208 regex_search(html, match, regex(
"<title>([^<]+)", icase));
const string remove_newlines(string text)
Replace newlines with spaces.
Definition: uri.cpp:612
const string unescape_html(const string &html)
Convert HTML entities to UTF-8.
Definition: uri.cpp:287
◆ get()
Download URI and extract title, description and full text.
- Since
- 0.6.0
125 catch (
const Poco::Exception &e)
127 return {
false, e.displayText(),
"",
"",
"" };
130 return {
false,
"Unknown error.",
"",
"",
"" };
const string strip_html(const string &html)
Removes HTML tags and superfluous spaces from an HTML page.
Definition: uri.cpp:229
const string extract_title(const string &html)
Extract the title from an HTML page.
Definition: uri.cpp:202
const string make_request(const string &uri, bool archive=false) const
Make an HTTP(S) request.
Definition: uri.cpp:133
const string extract_description(const string &html)
Extract the description from an HTML page.
Definition: uri.cpp:215
◆ make_request()
const string remwharead::URI::make_request |
( |
const string & |
uri, |
|
|
bool |
archive = false |
|
) |
| const |
|
protected |
Make an HTTP(S) request.
- Since
- 0.6.0
135 Poco::URI poco_uri(uri);
137 archive ? HTTPRequest::HTTP_HEAD : HTTPRequest::HTTP_GET;
138 string path = poco_uri.getPathAndQuery();
144 unique_ptr<HTTPClientSession> session;
145 if (poco_uri.getScheme() ==
"https")
147 session = make_unique<HTTPSClientSession>(poco_uri.getHost(),
150 else if (poco_uri.getScheme() ==
"http")
152 session = make_unique<HTTPClientSession>(poco_uri.getHost(),
156 HTTPRequest request(method, path, HTTPMessage::HTTP_1_1);
157 request.set(
"User-Agent",
string(
"remwharead/") + global::version);
159 HTTPResponse response;
161 session->sendRequest(request);
162 istream &rs = session->receiveResponse(response);
165 switch (response.getStatus())
173 string location = response.get(
"Location");
174 if (location.substr(0, 4) !=
"http")
176 location = poco_uri.getScheme() +
"://" + poco_uri.getHost()
181 case HTTPResponse::HTTP_OK:
186 answer = response.get(
"Content-Location");
190 StreamCopier::copyToString(rs, answer);
196 throw Poco::Exception(response.getReason());
const string make_request(const string &uri, bool archive=false) const
Make an HTTP(S) request.
Definition: uri.cpp:133
const archive_answer archive()
Save URI in archive and return archive-URI.
Definition: uri.cpp:587
◆ remove_html_tags()
const string remwharead::URI::remove_html_tags |
( |
const string & |
html, |
|
|
const string & |
tag = "" |
|
) |
| |
|
protected |
Remove HTML tags.
- Parameters
-
html | HTML page. |
tag | If set, only remove this tag. |
- Since
- 0.6.0
256 while (pos != std::string::npos)
258 size_t startpos = html.find(
'<', pos);
259 size_t endpos = html.find(
'>', startpos);
260 out += html.substr(pos, startpos - pos);
262 if (pos != std::string::npos)
272 while ((pos = out.find(
"<" + tag)) != std::string::npos)
274 size_t endpos = out.find(
"</" + tag, pos);
275 if (endpos == std::string::npos)
279 endpos += 3 + tag.length();
280 out.replace(pos, endpos - pos,
"");
◆ remove_newlines()
const string remwharead::URI::remove_newlines |
( |
string |
text | ) |
|
|
protected |
Replace newlines with spaces.
- Since
- 0.6.0
615 while ((posn = text.find(
'\n', posn)) != std::string::npos)
617 text.replace(posn, 1,
" ");
619 size_t posr = posn - 1;
620 if (text[posr] ==
'\r')
622 text.replace(posr, 1,
" ");
◆ strip_html()
const string remwharead::URI::strip_html |
( |
const string & |
html | ) |
|
|
protected |
Removes HTML tags and superfluous spaces from an HTML page.
- Since
- 0.6.0
238 while ((pos = out.find(
"\r", pos)) != std::string::npos)
240 out.replace(pos, 1,
"");
243 out = regex_replace(out, regex(
"\\s+\n"),
"\n");
244 out = regex_replace(out, regex(
"\n{2,}"),
"\n");
const string remove_html_tags(const string &html, const string &tag="")
Remove HTML tags.
Definition: uri.cpp:249
const string unescape_html(const string &html)
Convert HTML entities to UTF-8.
Definition: uri.cpp:287
◆ unescape_html()
const string remwharead::URI::unescape_html |
( |
const string & |
html | ) |
|
|
protected |
Convert HTML entities to UTF-8.
- Since
- 0.6.0
289 string buffer = html;
293 std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> u8c;
294 regex re_entity(
"&#(x)?([[:alnum:]]{1,8});");
297 while (regex_search(buffer, match, re_entity))
299 char32_t codepoint = 0;
301 if (match[1].length() == 1)
303 codepoint = std::stoi(match[2].str(),
nullptr, 16);
307 codepoint = std::stoi(match[2].str(),
nullptr, 10);
309 output += match.prefix().str() + u8c.to_bytes(codepoint);
310 buffer = match.suffix().str();
316 const array<const std::pair<const string, const char32_t>, 258> names =
318 {
"exclamation", 0x0021 },
320 {
"percent", 0x0025 },
331 {
"curren", 0x00A4 },
333 {
"brvbar", 0x00A6 },
344 {
"plusmn", 0x00B1 },
350 {
"middot", 0x00B7 },
355 {
"frac14", 0x00BC },
356 {
"frac12", 0x00BD },
357 {
"frac34", 0x00BE },
358 {
"iquest", 0x00BF },
359 {
"Agrave", 0x00C0 },
360 {
"Aacute", 0x00C1 },
362 {
"Atilde", 0x00C3 },
366 {
"Ccedil", 0x00C7 },
367 {
"Egrave", 0x00C8 },
368 {
"Eacute", 0x00C9 },
371 {
"Igrave", 0x00CC },
372 {
"Iacute", 0x00CD },
376 {
"Ntilde", 0x00D1 },
377 {
"Ograve", 0x00D2 },
378 {
"Oacute", 0x00D3 },
380 {
"Otilde", 0x00D5 },
383 {
"Oslash", 0x00D8 },
384 {
"Ugrave", 0x00D9 },
385 {
"Uacute", 0x00DA },
388 {
"Yacute", 0x00DD },
391 {
"agrave", 0x00E0 },
392 {
"aacute", 0x00E1 },
394 {
"atilde", 0x00E3 },
398 {
"ccedil", 0x00E7 },
399 {
"egrave", 0x00E8 },
400 {
"eacute", 0x00E9 },
403 {
"igrave", 0x00EC },
404 {
"iacute", 0x00ED },
408 {
"ntilde", 0x00F1 },
409 {
"ograve", 0x00F2 },
410 {
"oacute", 0x00F3 },
412 {
"otilde", 0x00F5 },
414 {
"divide", 0x00F7 },
415 {
"oslash", 0x00F8 },
416 {
"ugrave", 0x00F9 },
417 {
"uacute", 0x00FA },
420 {
"yacute", 0x00FD },
425 {
"Scaron", 0x0160 },
426 {
"scaron", 0x0161 },
435 {
"Epsilon", 0x0395 },
441 {
"Lambda", 0x039B },
445 {
"Omicron", 0x039F },
450 {
"Upsilon", 0x03A5 },
459 {
"epsilon", 0x03B5 },
465 {
"lambda", 0x03BB },
469 {
"omicron", 0x03BF },
472 {
"sigmaf", 0x03C2 },
475 {
"upsilon", 0x03C5 },
480 {
"thetasym", 0x03D1 },
485 {
"thinsp", 0x2009 },
492 {
"horbar", 0x2015 },
499 {
"dagger", 0x2020 },
500 {
"Dagger", 0x2021 },
502 {
"hellip", 0x2026 },
503 {
"permil", 0x2030 },
506 {
"lsaquo", 0x2039 },
507 {
"rsaquo", 0x203A },
512 {
"weierp", 0x2118 },
515 {
"alefsym", 0x2135 },
527 {
"forall", 0x2200 },
538 {
"lowast", 0x2217 },
548 {
"there4", 0x2234 },
562 {
"otimes", 0x2297 },
567 {
"lfloor", 0x230A },
568 {
"rfloor", 0x230B },
572 {
"spades", 0x2660 },
574 {
"hearts", 0x2665 },
578 for (
auto &pair : names)
580 const regex re(
'&' + pair.first +
';');
581 output = regex_replace(output, re, u8c.to_bytes(pair.second));
The documentation for this class was generated from the following files: