Download, archive and process a URI.
More...
#include <uri.hpp>
|
| URI (const string &uri) |
| Construct object and set URL. More...
|
|
const html_extract | get () |
| Download URI and extract title, description and full text. More...
|
|
const string | archive () |
| Save URI in archive and return archive-URI. More...
|
|
Download, archive and process a URI.
◆ URI()
remwharead::URI::URI |
( |
const string & |
uri | ) |
|
|
explicit |
Construct object and set URL.
59 Poco::Net::initializeSSL();
63 HTTPClientSession::ProxyConfig proxy;
64 string proxy_env = Environment::get(
"http_proxy");
68 if ((pos = proxy_env.find(
"//")) != string::npos)
70 proxy_env = proxy_env.substr(pos + 2);
72 if ((pos = proxy_env.find(
'/')) != string::npos)
74 proxy_env = proxy_env.substr(0, pos);
76 if ((pos = proxy_env.find(
':')) != string::npos)
78 proxy.host = proxy_env.substr(0, pos);
79 proxy.port = std::stoi(proxy_env.substr(pos + 1));
83 proxy.host = proxy_env;
86 HTTPClientSession::setGlobalProxyConfig(proxy);
88 catch (
const std::exception &)
◆ archive()
const string remwharead::URI::archive |
( |
| ) |
|
Save URI in archive and return archive-URI.
570 if (_uri.substr(0, 4) !=
"http")
577 const string answer =
make_request(
"https://web.archive.org/save/" 581 if (regex_search(answer, match, regex(
"Content-Location: (.+)\r")))
583 return "https://web.archive.org" + match[1].str();
587 cerr <<
"Error: Could not archive page.\n";
590 catch (
const std::exception &e)
592 cerr <<
"Error in " << __func__ <<
": " << e.what() <<
".\n";
const string make_request(const string &uri) const
Make an HTTP(S) request.
Definition: uri.cpp:129
◆ extract_description()
const string remwharead::URI::extract_description |
( |
const string & |
html | ) |
|
|
protected |
Extract the description from an HTML page.
199 const regex re_htmlfile(
"\\.(.?html?|xml|rss)$");
200 if (_uri.substr(0, 4) ==
"http" || regex_search(_uri, re_htmlfile))
203 const regex re(
"description\"[^>]+content=\"([^\"]+)", icase);
204 regex_search(html, match, re);
const string strip_html(const string &html)
Removes HTML tags and superfluous spaces from an HTML page.
Definition: uri.cpp:211
const string remove_newlines(string text)
Replace newlines with spaces.
Definition: uri.cpp:598
◆ extract_title()
const string remwharead::URI::extract_title |
( |
const string & |
html | ) |
|
|
protected |
Extract the title from an HTML page.
186 const regex re_htmlfile(
"\\.(.?html?|xml|rss)$");
187 if (_uri.substr(0, 4) ==
"http" || regex_search(_uri, re_htmlfile))
190 regex_search(html, match, regex(
"<title>([^<]+)", icase));
const string remove_newlines(string text)
Replace newlines with spaces.
Definition: uri.cpp:598
const string unescape_html(const string &html)
Convert HTML entities to UTF-8.
Definition: uri.cpp:268
◆ get()
Download URI and extract title, description and full text.
103 std::ostringstream oss;
108 cerr <<
"Error: Could not download page.\n";
120 catch (
const std::exception &e)
123 cerr <<
"Error in " << __func__ <<
": " << e.what() << endl;
126 return {
"",
"",
"" };
const string strip_html(const string &html)
Removes HTML tags and superfluous spaces from an HTML page.
Definition: uri.cpp:211
const string extract_title(const string &html)
Extract the title from an HTML page.
Definition: uri.cpp:184
const string extract_description(const string &html)
Extract the description from an HTML page.
Definition: uri.cpp:197
const string make_request(const string &uri) const
Make an HTTP(S) request.
Definition: uri.cpp:129
◆ make_request()
const string remwharead::URI::make_request |
( |
const string & |
uri | ) |
const |
|
protected |
Make an HTTP(S) request.
131 Poco::URI poco_uri(uri);
132 string path = poco_uri.getPathAndQuery();
138 unique_ptr<HTTPClientSession> session;
139 if (poco_uri.getScheme() ==
"https")
141 session = make_unique<HTTPSClientSession>(poco_uri.getHost(),
144 else if (poco_uri.getScheme() ==
"http")
146 session = make_unique<HTTPClientSession>(poco_uri.getHost(),
150 HTTPRequest request(HTTPRequest::HTTP_GET, path,
151 HTTPMessage::HTTP_1_1);
152 request.set(
"User-Agent",
string(
"remwharead/") + global::version);
154 HTTPResponse response;
156 session->sendRequest(request);
157 istream &rs = session->receiveResponse(response);
160 switch (response.getStatus())
170 case HTTPResponse::HTTP_OK:
173 StreamCopier::copyToString(rs, answer);
178 cerr << response.getStatus() <<
" " << response.getReason() << endl;
const string make_request(const string &uri) const
Make an HTTP(S) request.
Definition: uri.cpp:129
◆ remove_html_tags()
const string remwharead::URI::remove_html_tags |
( |
const string & |
html, |
|
|
const string & |
tag = "" |
|
) |
| |
|
protected |
Remove HTML tags.
- Parameters
-
html | HTML page. |
tag | If set, only remove this tag. |
237 while (pos != std::string::npos)
239 size_t startpos = html.find(
'<', pos);
240 size_t endpos = html.find(
'>', startpos);
241 out += html.substr(pos, startpos - pos);
243 if (pos != std::string::npos)
253 while ((pos = out.find(
"<" + tag)) != std::string::npos)
255 size_t endpos = out.find(
"</" + tag, pos);
256 if (endpos == std::string::npos)
260 endpos += 3 + tag.length();
261 out.replace(pos, endpos - pos,
"");
◆ remove_newlines()
const string remwharead::URI::remove_newlines |
( |
string |
text | ) |
|
|
protected |
Replace newlines with spaces.
601 while ((posn = text.find(
'\n', posn)) != std::string::npos)
603 text.replace(posn, 1,
" ");
605 size_t posr = posn - 1;
606 if (text[posr] ==
'\r')
608 text.replace(posr, 1,
" ");
◆ strip_html()
const string remwharead::URI::strip_html |
( |
const string & |
html | ) |
|
|
protected |
Removes HTML tags and superfluous spaces from an HTML page.
220 while ((pos = out.find(
"\r", pos)) != std::string::npos)
222 out.replace(pos, 1,
"");
225 out = regex_replace(out, regex(
"\\s+\n"),
"\n");
226 out = regex_replace(out, regex(
"\n{2,}"),
"\n");
const string remove_html_tags(const string &html, const string &tag="")
Remove HTML tags.
Definition: uri.cpp:230
const string unescape_html(const string &html)
Convert HTML entities to UTF-8.
Definition: uri.cpp:268
◆ unescape_html()
const string remwharead::URI::unescape_html |
( |
const string & |
html | ) |
|
|
protected |
Convert HTML entities to UTF-8.
270 string buffer = html;
274 std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> u8c;
275 regex re_entity(
"&#(x)?([[:alnum:]]{1,8});");
278 while (regex_search(buffer, match, re_entity))
280 char32_t codepoint = 0;
282 if (match[1].length() == 1)
284 codepoint = std::stoi(match[2].str(),
nullptr, 16);
288 codepoint = std::stoi(match[2].str(),
nullptr, 10);
290 output += match.prefix().str() + u8c.to_bytes(codepoint);
291 buffer = match.suffix().str();
297 const array<const std::pair<const string, const char32_t>, 258> names =
299 {
"exclamation", 0x0021 },
301 {
"percent", 0x0025 },
312 {
"curren", 0x00A4 },
314 {
"brvbar", 0x00A6 },
325 {
"plusmn", 0x00B1 },
331 {
"middot", 0x00B7 },
336 {
"frac14", 0x00BC },
337 {
"frac12", 0x00BD },
338 {
"frac34", 0x00BE },
339 {
"iquest", 0x00BF },
340 {
"Agrave", 0x00C0 },
341 {
"Aacute", 0x00C1 },
343 {
"Atilde", 0x00C3 },
347 {
"Ccedil", 0x00C7 },
348 {
"Egrave", 0x00C8 },
349 {
"Eacute", 0x00C9 },
352 {
"Igrave", 0x00CC },
353 {
"Iacute", 0x00CD },
357 {
"Ntilde", 0x00D1 },
358 {
"Ograve", 0x00D2 },
359 {
"Oacute", 0x00D3 },
361 {
"Otilde", 0x00D5 },
364 {
"Oslash", 0x00D8 },
365 {
"Ugrave", 0x00D9 },
366 {
"Uacute", 0x00DA },
369 {
"Yacute", 0x00DD },
372 {
"agrave", 0x00E0 },
373 {
"aacute", 0x00E1 },
375 {
"atilde", 0x00E3 },
379 {
"ccedil", 0x00E7 },
380 {
"egrave", 0x00E8 },
381 {
"eacute", 0x00E9 },
384 {
"igrave", 0x00EC },
385 {
"iacute", 0x00ED },
389 {
"ntilde", 0x00F1 },
390 {
"ograve", 0x00F2 },
391 {
"oacute", 0x00F3 },
393 {
"otilde", 0x00F5 },
395 {
"divide", 0x00F7 },
396 {
"oslash", 0x00F8 },
397 {
"ugrave", 0x00F9 },
398 {
"uacute", 0x00FA },
401 {
"yacute", 0x00FD },
406 {
"Scaron", 0x0160 },
407 {
"scaron", 0x0161 },
416 {
"Epsilon", 0x0395 },
422 {
"Lambda", 0x039B },
426 {
"Omicron", 0x039F },
431 {
"Upsilon", 0x03A5 },
440 {
"epsilon", 0x03B5 },
446 {
"lambda", 0x03BB },
450 {
"omicron", 0x03BF },
453 {
"sigmaf", 0x03C2 },
456 {
"upsilon", 0x03C5 },
461 {
"thetasym", 0x03D1 },
466 {
"thinsp", 0x2009 },
473 {
"horbar", 0x2015 },
480 {
"dagger", 0x2020 },
481 {
"Dagger", 0x2021 },
483 {
"hellip", 0x2026 },
484 {
"permil", 0x2030 },
487 {
"lsaquo", 0x2039 },
488 {
"rsaquo", 0x203A },
493 {
"weierp", 0x2118 },
496 {
"alefsym", 0x2135 },
508 {
"forall", 0x2200 },
519 {
"lowast", 0x2217 },
529 {
"there4", 0x2234 },
543 {
"otimes", 0x2297 },
548 {
"lfloor", 0x230A },
549 {
"rfloor", 0x230B },
553 {
"spades", 0x2660 },
555 {
"hearts", 0x2665 },
559 for (
auto &pair : names)
561 const regex re(
'&' + pair.first +
';');
562 output = regex_replace(output, re, u8c.to_bytes(pair.second));
The documentation for this class was generated from the following files: