Download, archive and process a URI.
More...
#include <uri.hpp>
|
| URI (const string &uri) |
| Construct object and set URL. More...
|
|
const html_extract | get () |
| Download URI and extract title, description and full text. More...
|
|
const string | archive () |
| Save URI in archive and return archive-URI. More...
|
|
Download, archive and process a URI.
◆ URI()
remwharead::URI::URI |
( |
const string & |
uri | ) |
|
|
explicit |
Construct object and set URL.
◆ archive()
const string remwharead::URI::archive |
( |
| ) |
|
Save URI in archive and return archive-URI.
477 if (_uri.substr(0, 4) !=
"http")
484 std::ostringstream oss;
485 curlpp::Easy request;
487 request.setOpt<curlopts::Url>(
"https://web.archive.org/save/" 489 request.setOpt<curlopts::WriteStream>(&oss);
490 request.setOpt<curlopts::NoBody>(
true);
491 request.setOpt<curlpp::options::Header>(
true);
495 const string answer = oss.str();
496 if (regex_search(answer, match, regex(
"Content-Location: (.+)\r")))
498 return "https://web.archive.org" + match[1].str();
502 cerr <<
"Error: Could not archive page. HTTP status: " 503 << curlpp::infos::ResponseCode::get(request) << endl;
506 catch (
const std::exception &e)
508 cerr <<
"Error in " << __func__ <<
": " << e.what() << endl;
void set_curlpp_options(curlpp::Easy &request)
Sets common curlpp options.
Definition: uri.cpp:83
◆ extract_description()
const string remwharead::URI::extract_description |
( |
const string & |
html | ) |
|
|
protected |
Extract the description from an HTML page.
106 const regex re_htmlfile(
"\\.(.?html?|xml|rss)$");
107 if (_uri.substr(0, 4) ==
"http" || regex_search(_uri, re_htmlfile))
110 const regex re(
"description\"[^>]+content=\"([^\"]+)", icase);
111 regex_search(html, match, re);
const string strip_html(const string &html)
Removes HTML tags and superfluous spaces from an HTML page.
Definition: uri.cpp:118
const string remove_newlines(string text)
Replace newlines with spaces.
Definition: uri.cpp:514
◆ extract_title()
const string remwharead::URI::extract_title |
( |
const string & |
html | ) |
|
|
protected |
Extract the title from an HTML page.
93 const regex re_htmlfile(
"\\.(.?html?|xml|rss)$");
94 if (_uri.substr(0, 4) ==
"http" || regex_search(_uri, re_htmlfile))
97 regex_search(html, match, regex(
"<title>([^<]+)", icase));
const string remove_newlines(string text)
Replace newlines with spaces.
Definition: uri.cpp:514
const string unescape_html(const string &html)
Convert HTML entities to UTF-8.
Definition: uri.cpp:175
◆ get()
Download URI and extract title, description and full text.
52 std::ostringstream oss;
55 request.setOpt<curlopts::Url>(_uri);
56 request.setOpt<curlopts::WriteStream>(&oss);
59 const string answer = oss.str();
62 cerr <<
"Error: Could not download page. Response code: " 63 << curlpp::infos::ResponseCode::get(request) << endl;
75 catch (
const std::exception &e)
77 cerr <<
"Error in " << __func__ <<
": " << e.what() << endl;
80 return {
"",
"",
"" };
const string strip_html(const string &html)
Removes HTML tags and superfluous spaces from an HTML page.
Definition: uri.cpp:118
void set_curlpp_options(curlpp::Easy &request)
Sets common curlpp options.
Definition: uri.cpp:83
const string extract_title(const string &html)
Extract the title from an HTML page.
Definition: uri.cpp:91
const string extract_description(const string &html)
Extract the description from an HTML page.
Definition: uri.cpp:104
◆ remove_html_tags()
const string remwharead::URI::remove_html_tags |
( |
const string & |
html, |
|
|
const string & |
tag = "" |
|
) |
| |
|
protected |
Remove HTML tags.
- Parameters
-
html | HTML page. |
tag | If set, only remove this tag. |
144 while (pos != std::string::npos)
146 size_t startpos = html.find(
'<', pos);
147 size_t endpos = html.find(
'>', startpos);
148 out += html.substr(pos, startpos - pos);
150 if (pos != std::string::npos)
160 while ((pos = out.find(
"<" + tag)) != std::string::npos)
162 size_t endpos = out.find(
"</" + tag, pos);
163 if (endpos == std::string::npos)
167 endpos += 3 + tag.length();
168 out.replace(pos, endpos - pos,
"");
◆ remove_newlines()
const string remwharead::URI::remove_newlines |
( |
string |
text | ) |
|
|
protected |
Replace newlines with spaces.
517 while ((posn = text.find(
'\n', posn)) != std::string::npos)
519 text.replace(posn, 1,
" ");
521 size_t posr = posn - 1;
522 if (text[posr] ==
'\r')
524 text.replace(posr, 1,
" ");
◆ set_curlpp_options()
void remwharead::URI::set_curlpp_options |
( |
curlpp::Easy & |
request | ) |
|
|
protected |
Sets common curlpp options.
85 request.setOpt<curlopts::UserAgent>(string(
"remwharead/")
87 request.setOpt<curlopts::HttpHeader>({
"Connection: close" });
88 request.setOpt<curlopts::FollowLocation>(
true);
◆ strip_html()
const string remwharead::URI::strip_html |
( |
const string & |
html | ) |
|
|
protected |
Removes HTML tags and superfluous spaces from an HTML page.
127 while ((pos = out.find(
"\r", pos)) != std::string::npos)
129 out.replace(pos, 1,
"");
132 out = regex_replace(out, regex(
"\\s+\n"),
"\n");
133 out = regex_replace(out, regex(
"\n{2,}"),
"\n");
const string remove_html_tags(const string &html, const string &tag="")
Remove HTML tags.
Definition: uri.cpp:137
const string unescape_html(const string &html)
Convert HTML entities to UTF-8.
Definition: uri.cpp:175
◆ unescape_html()
const string remwharead::URI::unescape_html |
( |
const string & |
html | ) |
|
|
protected |
Convert HTML entities to UTF-8.
177 string buffer = html;
181 std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> u8c;
182 regex re_entity(
"&#(x)?([[:alnum:]]{1,8});");
185 while (regex_search(buffer, match, re_entity))
187 char32_t codepoint = 0;
189 if (match[1].length() == 1)
191 codepoint = std::stoi(match[2].str(),
nullptr, 16);
195 codepoint = std::stoi(match[2].str(),
nullptr, 10);
197 output += match.prefix().str() + u8c.to_bytes(codepoint);
198 buffer = match.suffix().str();
204 const array<const std::pair<const string, const char32_t>, 258> names =
206 {
"exclamation", 0x0021 },
208 {
"percent", 0x0025 },
219 {
"curren", 0x00A4 },
221 {
"brvbar", 0x00A6 },
232 {
"plusmn", 0x00B1 },
238 {
"middot", 0x00B7 },
243 {
"frac14", 0x00BC },
244 {
"frac12", 0x00BD },
245 {
"frac34", 0x00BE },
246 {
"iquest", 0x00BF },
247 {
"Agrave", 0x00C0 },
248 {
"Aacute", 0x00C1 },
250 {
"Atilde", 0x00C3 },
254 {
"Ccedil", 0x00C7 },
255 {
"Egrave", 0x00C8 },
256 {
"Eacute", 0x00C9 },
259 {
"Igrave", 0x00CC },
260 {
"Iacute", 0x00CD },
264 {
"Ntilde", 0x00D1 },
265 {
"Ograve", 0x00D2 },
266 {
"Oacute", 0x00D3 },
268 {
"Otilde", 0x00D5 },
271 {
"Oslash", 0x00D8 },
272 {
"Ugrave", 0x00D9 },
273 {
"Uacute", 0x00DA },
276 {
"Yacute", 0x00DD },
279 {
"agrave", 0x00E0 },
280 {
"aacute", 0x00E1 },
282 {
"atilde", 0x00E3 },
286 {
"ccedil", 0x00E7 },
287 {
"egrave", 0x00E8 },
288 {
"eacute", 0x00E9 },
291 {
"igrave", 0x00EC },
292 {
"iacute", 0x00ED },
296 {
"ntilde", 0x00F1 },
297 {
"ograve", 0x00F2 },
298 {
"oacute", 0x00F3 },
300 {
"otilde", 0x00F5 },
302 {
"divide", 0x00F7 },
303 {
"oslash", 0x00F8 },
304 {
"ugrave", 0x00F9 },
305 {
"uacute", 0x00FA },
308 {
"yacute", 0x00FD },
313 {
"Scaron", 0x0160 },
314 {
"scaron", 0x0161 },
323 {
"Epsilon", 0x0395 },
329 {
"Lambda", 0x039B },
333 {
"Omicron", 0x039F },
338 {
"Upsilon", 0x03A5 },
347 {
"epsilon", 0x03B5 },
353 {
"lambda", 0x03BB },
357 {
"omicron", 0x03BF },
360 {
"sigmaf", 0x03C2 },
363 {
"upsilon", 0x03C5 },
368 {
"thetasym", 0x03D1 },
373 {
"thinsp", 0x2009 },
380 {
"horbar", 0x2015 },
387 {
"dagger", 0x2020 },
388 {
"Dagger", 0x2021 },
390 {
"hellip", 0x2026 },
391 {
"permil", 0x2030 },
394 {
"lsaquo", 0x2039 },
395 {
"rsaquo", 0x203A },
400 {
"weierp", 0x2118 },
403 {
"alefsym", 0x2135 },
415 {
"forall", 0x2200 },
426 {
"lowast", 0x2217 },
436 {
"there4", 0x2234 },
450 {
"otimes", 0x2297 },
455 {
"lfloor", 0x230A },
456 {
"rfloor", 0x230B },
460 {
"spades", 0x2660 },
462 {
"hearts", 0x2665 },
466 for (
auto &pair : names)
468 const regex re(
'&' + pair.first +
';');
469 output = regex_replace(output, re, u8c.to_bytes(pair.second));
The documentation for this class was generated from the following files: