Extract the description from an HTML page.
242 const RegEx re_htmlfile(
".*\\.(.?html?|xml|rss)$", RegEx::RE_CASELESS);
243 if (_uri.substr(0, 4) ==
"http" || re_htmlfile.match(_uri))
245 const RegEx re_desc(R
"(description"[^>]+content="([^"]+))", 247 vector<string> matches; 248 re_desc.split(html, matches); 249 if (matches.size() >= 2)
267 while ((pos = out.find(
'\r', pos)) != std::string::npos)
269 out.replace(pos, 1,
"");
273 RegEx(
"\\s+\n").subst(out,
"\n", RegEx::RE_GLOBAL);
274 RegEx(
"\n{2,}").subst(out,
"\n", RegEx::RE_GLOBAL);
286 while (pos != std::string::npos)
288 size_t startpos = html.find(
'<', pos);
289 size_t endpos = html.find(
'>', startpos);
290 out += html.substr(pos, startpos - pos);
292 if (pos != std::string::npos)
302 while ((pos = out.find(
"<" + tag)) != std::string::npos)
304 size_t endpos = out.find(
"</" + tag, pos);
305 if (endpos == std::string::npos)
309 endpos += 3 + tag.length();
310 out.replace(pos, endpos - pos,
"");
320 std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> u8c;
321 const RegEx re_entity(
"&#(x)?([[:alnum:]]{1,8});");
322 RegEx::MatchVec matches;
323 string::size_type pos = 0;
325 while (re_entity.match(html, pos, matches) != 0)
327 char32_t codepoint = 0;
328 const string number = html.substr(matches[2].offset, matches[2].length);
330 if (matches[1].length != 0)
332 codepoint = std::stoi(number,
nullptr, 16);
336 codepoint = std::stoi(number,
nullptr, 10);
338 const string unicode = u8c.to_bytes(codepoint);
339 html.replace(matches[0].offset, matches[0].length, unicode);
340 pos = matches[0].offset + unicode.length();
345 const array<const std::pair<const string, const char32_t>, 258> names =
347 {
"exclamation", 0x0021 },
349 {
"percent", 0x0025 },
360 {
"curren", 0x00A4 },
362 {
"brvbar", 0x00A6 },
373 {
"plusmn", 0x00B1 },
379 {
"middot", 0x00B7 },
384 {
"frac14", 0x00BC },
385 {
"frac12", 0x00BD },
386 {
"frac34", 0x00BE },
387 {
"iquest", 0x00BF },
388 {
"Agrave", 0x00C0 },
389 {
"Aacute", 0x00C1 },
391 {
"Atilde", 0x00C3 },
395 {
"Ccedil", 0x00C7 },
396 {
"Egrave", 0x00C8 },
397 {
"Eacute", 0x00C9 },
400 {
"Igrave", 0x00CC },
401 {
"Iacute", 0x00CD },
405 {
"Ntilde", 0x00D1 },
406 {
"Ograve", 0x00D2 },
407 {
"Oacute", 0x00D3 },
409 {
"Otilde", 0x00D5 },
412 {
"Oslash", 0x00D8 },
413 {
"Ugrave", 0x00D9 },
414 {
"Uacute", 0x00DA },
417 {
"Yacute", 0x00DD },
420 {
"agrave", 0x00E0 },
421 {
"aacute", 0x00E1 },
423 {
"atilde", 0x00E3 },
427 {
"ccedil", 0x00E7 },
428 {
"egrave", 0x00E8 },
429 {
"eacute", 0x00E9 },
432 {
"igrave", 0x00EC },
433 {
"iacute", 0x00ED },
437 {
"ntilde", 0x00F1 },
438 {
"ograve", 0x00F2 },
439 {
"oacute", 0x00F3 },
441 {
"otilde", 0x00F5 },
443 {
"divide", 0x00F7 },
444 {
"oslash", 0x00F8 },
445 {
"ugrave", 0x00F9 },
446 {
"uacute", 0x00FA },
449 {
"yacute", 0x00FD },
454 {
"Scaron", 0x0160 },
455 {
"scaron", 0x0161 },
464 {
"Epsilon", 0x0395 },
470 {
"Lambda", 0x039B },
474 {
"Omicron", 0x039F },
479 {
"Upsilon", 0x03A5 },
488 {
"epsilon", 0x03B5 },
494 {
"lambda", 0x03BB },
498 {
"omicron", 0x03BF },
501 {
"sigmaf", 0x03C2 },
504 {
"upsilon", 0x03C5 },
509 {
"thetasym", 0x03D1 },
514 {
"thinsp", 0x2009 },
521 {
"horbar", 0x2015 },
528 {
"dagger", 0x2020 },
529 {
"Dagger", 0x2021 },
531 {
"hellip", 0x2026 },
532 {
"permil", 0x2030 },
535 {
"lsaquo", 0x2039 },
536 {
"rsaquo", 0x203A },
541 {
"weierp", 0x2118 },
544 {
"alefsym", 0x2135 },
556 {
"forall", 0x2200 },
567 {
"lowast", 0x2217 },
577 {
"there4", 0x2234 },
591 {
"otimes", 0x2297 },
596 {
"lfloor", 0x230A },
597 {
"rfloor", 0x230B },
601 {
"spades", 0x2660 },
603 {
"hearts", 0x2665 },
607 for (
auto &pair : names)
609 const RegEx re(
'&' + pair.first +
';');
610 re.subst(html, u8c.to_bytes(pair.second), RegEx::RE_GLOBAL);
618 if (_uri.substr(0, 4) !=
"http")
620 return {
false,
"Only HTTP(S) is archivable.",
"" };
625 const string answer =
make_request(
"https://web.archive.org/save/" 630 return {
true,
"",
"https://web.archive.org" + answer };
633 catch (
const Poco::Exception &e)
635 return {
false, e.displayText(),
"" };
638 return {
false,
"Unknown error.",
"" };
archive_answer archive()
Save URI in archive and return archive-URI.
string make_request(const string &uri, bool archive=false) const
Make an HTTP(S) request.
Definition: uri.cpp:151
string unescape_html(string html)
Convert HTML entities to UTF-8.
string strip_html(const string &html)
Removes HTML tags and superfluous spaces from an HTML page.
string remove_newlines(string text)
Replace newlines with spaces.
Definition: uri.cpp:641
string remove_html_tags(const string &html, const string &tag="")
Remove HTML tags.