Extract the description from an HTML page.
246 const RegEx re_htmlfile(
".*\\.(.?html?|xml|rss)$", RegEx::RE_CASELESS);
247 if (_uri.substr(0, 4) ==
"http" || re_htmlfile.match(_uri))
249 const RegEx re_desc(R
"(description"[^>]+content="([^"]+))", 251 vector<string> matches; 252 re_desc.split(html, matches); 253 if (matches.size() >= 2)
271 while ((pos = out.find(
'\r', pos)) != std::string::npos)
273 out.replace(pos, 1,
"");
277 RegEx(
"\\s+\n").subst(out,
"\n", RegEx::RE_GLOBAL);
278 RegEx(
"\n{2,}").subst(out,
"\n", RegEx::RE_GLOBAL);
290 while (pos != std::string::npos)
292 size_t startpos = html.find(
'<', pos);
293 size_t endpos = html.find(
'>', startpos);
294 out += html.substr(pos, startpos - pos);
296 if (pos != std::string::npos)
306 while ((pos = out.find(
"<" + tag)) != std::string::npos)
308 size_t endpos = out.find(
"</" + tag, pos);
309 if (endpos == std::string::npos)
313 endpos += 3 + tag.length();
314 out.replace(pos, endpos - pos,
"");
324 std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> u8c;
325 const RegEx re_entity(
"&#(x)?([[:alnum:]]{1,8});");
326 RegEx::MatchVec matches;
327 string::size_type pos = 0;
329 while (re_entity.match(html, pos, matches) != 0)
331 char32_t codepoint = 0;
332 const string number = html.substr(matches[2].offset, matches[2].length);
334 if (matches[1].length != 0)
336 codepoint = static_cast<char32_t>(std::stoul(number,
nullptr, 16));
340 codepoint = static_cast<char32_t>(std::stoi(number,
nullptr, 10));
342 const string unicode = u8c.to_bytes(codepoint);
343 html.replace(matches[0].offset, matches[0].length, unicode);
344 pos = matches[0].offset + unicode.length();
349 const array<const std::pair<const string, const char32_t>, 258> names =
351 {
"exclamation", 0x0021 },
353 {
"percent", 0x0025 },
364 {
"curren", 0x00A4 },
366 {
"brvbar", 0x00A6 },
377 {
"plusmn", 0x00B1 },
383 {
"middot", 0x00B7 },
388 {
"frac14", 0x00BC },
389 {
"frac12", 0x00BD },
390 {
"frac34", 0x00BE },
391 {
"iquest", 0x00BF },
392 {
"Agrave", 0x00C0 },
393 {
"Aacute", 0x00C1 },
395 {
"Atilde", 0x00C3 },
399 {
"Ccedil", 0x00C7 },
400 {
"Egrave", 0x00C8 },
401 {
"Eacute", 0x00C9 },
404 {
"Igrave", 0x00CC },
405 {
"Iacute", 0x00CD },
409 {
"Ntilde", 0x00D1 },
410 {
"Ograve", 0x00D2 },
411 {
"Oacute", 0x00D3 },
413 {
"Otilde", 0x00D5 },
416 {
"Oslash", 0x00D8 },
417 {
"Ugrave", 0x00D9 },
418 {
"Uacute", 0x00DA },
421 {
"Yacute", 0x00DD },
424 {
"agrave", 0x00E0 },
425 {
"aacute", 0x00E1 },
427 {
"atilde", 0x00E3 },
431 {
"ccedil", 0x00E7 },
432 {
"egrave", 0x00E8 },
433 {
"eacute", 0x00E9 },
436 {
"igrave", 0x00EC },
437 {
"iacute", 0x00ED },
441 {
"ntilde", 0x00F1 },
442 {
"ograve", 0x00F2 },
443 {
"oacute", 0x00F3 },
445 {
"otilde", 0x00F5 },
447 {
"divide", 0x00F7 },
448 {
"oslash", 0x00F8 },
449 {
"ugrave", 0x00F9 },
450 {
"uacute", 0x00FA },
453 {
"yacute", 0x00FD },
458 {
"Scaron", 0x0160 },
459 {
"scaron", 0x0161 },
468 {
"Epsilon", 0x0395 },
474 {
"Lambda", 0x039B },
478 {
"Omicron", 0x039F },
483 {
"Upsilon", 0x03A5 },
492 {
"epsilon", 0x03B5 },
498 {
"lambda", 0x03BB },
502 {
"omicron", 0x03BF },
505 {
"sigmaf", 0x03C2 },
508 {
"upsilon", 0x03C5 },
513 {
"thetasym", 0x03D1 },
518 {
"thinsp", 0x2009 },
525 {
"horbar", 0x2015 },
532 {
"dagger", 0x2020 },
533 {
"Dagger", 0x2021 },
535 {
"hellip", 0x2026 },
536 {
"permil", 0x2030 },
539 {
"lsaquo", 0x2039 },
540 {
"rsaquo", 0x203A },
545 {
"weierp", 0x2118 },
548 {
"alefsym", 0x2135 },
560 {
"forall", 0x2200 },
571 {
"lowast", 0x2217 },
581 {
"there4", 0x2234 },
595 {
"otimes", 0x2297 },
600 {
"lfloor", 0x230A },
601 {
"rfloor", 0x230B },
605 {
"spades", 0x2660 },
607 {
"hearts", 0x2665 },
611 for (
auto &pair : names)
613 const RegEx re(
'&' + pair.first +
';');
614 re.subst(html, u8c.to_bytes(pair.second), RegEx::RE_GLOBAL);
622 if (_uri.substr(0, 4) !=
"http")
624 return {
false,
"Only HTTP(S) is archivable.",
"" };
629 const string answer =
make_request(
"https://web.archive.org/save/" 634 return {
true,
"",
"https://web.archive.org" + answer };
637 catch (
const Poco::Exception &e)
639 return {
false, e.displayText(),
"" };
642 return {
false,
"Unknown error.",
"" };
string unescape_html(string html) const
Convert HTML entities to UTF-8.
string make_request(const string &uri, bool archive=false) const
Make an HTTP(S) request.
Definition: uri.cpp:154
string remove_newlines(string text) const
Replace newlines with spaces.
Definition: uri.cpp:645
string remove_html_tags(const string &html, const string &tag="") const
Remove HTML tags.
string strip_html(const string &html) const
Removes HTML tags and superfluous spaces from an HTML page.
string cut_text(const string &text, uint16_t n_chars) const
Limits text to N characters, cuts at space.
Definition: uri.cpp:663
archive_answer archive() const
Save URI in archive and return archive-URI.