remwharead  0.6.3
Public Member Functions | Protected Member Functions | Protected Attributes | List of all members
remwharead::URI Class Reference

Download, archive and process a URI. More...

#include <uri.hpp>

Public Member Functions

 URI (const string &uri)
 Construct object and set URI. More...
 
const html_extract get ()
 Download URI and extract title, description and full text. More...
 
const string archive ()
 Save URI in archive and return archive-URI. More...
 

Protected Member Functions

void set_curlpp_options (curlpp::Easy &request)
 Sets common curlpp options. More...
 
const string extract_title (const string &html)
 Extract the title from an HTML page. More...
 
const string extract_description (const string &html)
 Extract the description from an HTML page. More...
 
const string strip_html (const string &html)
 Removes HTML tags and superfluous spaces from an HTML page. More...
 
const string remove_html_tags (const string &html, const string &tag="")
 Remove HTML tags. More...
 
const string unescape_html (const string &html)
 Convert HTML entities to UTF-8. More...
 
const string remove_newlines (string text)
 Replace newlines with spaces. More...
 

Protected Attributes

string _uri
 

Detailed Description

Download, archive and process a URI.

Constructor & Destructor Documentation

◆ URI()

remwharead::URI::URI ( const string &  uri)
explicit

Construct object and set URI.

44  :_uri(uri)
45  {
46  }

Member Function Documentation

◆ archive()

const string remwharead::URI::archive ( )

Save URI in archive and return archive-URI.

476  {
477  if (_uri.substr(0, 4) != "http")
478  {
479  return "";
480  }
481 
482  try
483  {
484  std::ostringstream oss;
485  curlpp::Easy request;
486  set_curlpp_options(request);
487  request.setOpt<curlopts::Url>("https://web.archive.org/save/"
488  + _uri);
489  request.setOpt<curlopts::WriteStream>(&oss);
490  request.setOpt<curlopts::NoBody>(true); // Make HEAD request.
491  request.setOpt<curlpp::options::Header>(true); // Keep headers.
492  request.perform();
493 
494  smatch match;
495  const string answer = oss.str();
496  if (regex_search(answer, match, regex("Content-Location: (.+)\r")))
497  {
498  return "https://web.archive.org" + match[1].str();
499  }
500  else
501  {
502  cerr << "Error: Could not archive page. HTTP status: "
503  << curlpp::infos::ResponseCode::get(request) << endl;
504  }
505  }
506  catch (const std::exception &e)
507  {
508  cerr << "Error in " << __func__ << ": " << e.what() << endl;
509  }
510 
511  return "";
512  }
void set_curlpp_options(curlpp::Easy &request)
Sets common curlpp options.
Definition: uri.cpp:83

◆ extract_description()

const string remwharead::URI::extract_description ( const string &  html)
protected

Extract the description from an HTML page.

105  {
106  const regex re_htmlfile("\\.(.?html?|xml|rss)$");
107  if (_uri.substr(0, 4) == "http" || regex_search(_uri, re_htmlfile))
108  {
109  smatch match;
110  const regex re("description\"[^>]+content=\"([^\"]+)", icase);
111  regex_search(html, match, re);
112  return remove_newlines(strip_html(match[1].str()));
113  }
114 
115  return "";
116  }
const string strip_html(const string &html)
Removes HTML tags and superfluous spaces from an HTML page.
Definition: uri.cpp:118
const string remove_newlines(string text)
Replace newlines with spaces.
Definition: uri.cpp:514

◆ extract_title()

const string remwharead::URI::extract_title ( const string &  html)
protected

Extract the title from an HTML page.

92  {
93  const regex re_htmlfile("\\.(.?html?|xml|rss)$");
94  if (_uri.substr(0, 4) == "http" || regex_search(_uri, re_htmlfile))
95  {
96  smatch match;
97  regex_search(html, match, regex("<title>([^<]+)", icase));
98  return remove_newlines(unescape_html(match[1].str()));
99  }
100 
101  return "";
102  }
const string remove_newlines(string text)
Replace newlines with spaces.
Definition: uri.cpp:514
const string unescape_html(const string &html)
Convert HTML entities to UTF-8.
Definition: uri.cpp:175

◆ get()

const html_extract remwharead::URI::get ( )

Download URI and extract title, description and full text.

49  {
50  try
51  {
52  std::ostringstream oss;
53  curlpp::Easy request;
54  set_curlpp_options(request);
55  request.setOpt<curlopts::Url>(_uri);
56  request.setOpt<curlopts::WriteStream>(&oss);
57  request.perform();
58 
59  const string answer = oss.str();
60  if (answer.empty())
61  {
62  cerr << "Error: Could not download page. Response code: "
63  << curlpp::infos::ResponseCode::get(request) << endl;
64  }
65  else
66  {
67  return
68  {
69  extract_title(answer),
70  extract_description(answer),
71  strip_html(answer)
72  };
73  }
74  }
75  catch (const std::exception &e)
76  {
77  cerr << "Error in " << __func__ << ": " << e.what() << endl;
78  }
79 
80  return { "", "", "" };
81  }
const string strip_html(const string &html)
Removes HTML tags and superfluous spaces from an HTML page.
Definition: uri.cpp:118
void set_curlpp_options(curlpp::Easy &request)
Sets common curlpp options.
Definition: uri.cpp:83
const string extract_title(const string &html)
Extract the title from an HTML page.
Definition: uri.cpp:91
const string extract_description(const string &html)
Extract the description from an HTML page.
Definition: uri.cpp:104

◆ remove_html_tags()

const string remwharead::URI::remove_html_tags ( const string &  html,
const string &  tag = "" 
)
protected

Remove HTML tags.

Parameters
html: HTML page.
tag: If set, only remove this tag.
138  {
139  // NOTE: I did this with regex_replace before, but libstdc++ segfaulted.
140  string out;
141  if (tag.empty())
142  {
143  size_t pos = 0;
144  while (pos != std::string::npos)
145  {
146  size_t startpos = html.find('<', pos);
147  size_t endpos = html.find('>', startpos);
148  out += html.substr(pos, startpos - pos);
149  pos = endpos;
150  if (pos != std::string::npos)
151  {
152  ++pos;
153  }
154  }
155  }
156  else
157  {
158  size_t pos = 0;
159  out = html;
160  while ((pos = out.find("<" + tag)) != std::string::npos)
161  {
162  size_t endpos = out.find("</" + tag, pos);
163  if (endpos == std::string::npos)
164  {
165  break;
166  }
167  endpos += 3 + tag.length(); // tag + </ + >
168  out.replace(pos, endpos - pos, "");
169  }
170  }
171 
172  return out;
173  }

◆ remove_newlines()

const string remwharead::URI::remove_newlines ( string  text)
protected

Replace newlines with spaces.

515  {
516  size_t posn = 0;
517  while ((posn = text.find('\n', posn)) != std::string::npos)
518  {
519  text.replace(posn, 1, " ");
520 
521  size_t posr = posn - 1;
522  if (text[posr] == '\r')
523  {
524  text.replace(posr, 1, " ");
525  }
526  ++posn;
527  }
528 
529  return text;
530  }

◆ set_curlpp_options()

void remwharead::URI::set_curlpp_options ( curlpp::Easy &  request)
protected

Sets common curlpp options.

84  {
85  request.setOpt<curlopts::UserAgent>(string("remwharead/")
86  + global::version);
87  request.setOpt<curlopts::HttpHeader>({ "Connection: close" });
88  request.setOpt<curlopts::FollowLocation>(true);
89  }

◆ strip_html()

const string remwharead::URI::strip_html ( const string &  html)
protected

Removes HTML tags and superfluous spaces from an HTML page.

119  {
120  string out;
121 
122  out = remove_html_tags(html, "script"); // Remove JavaScript.
123  out = remove_html_tags(out, "style"); // Remove CSS.
124  out = remove_html_tags(out); // Remove tags.
125 
126  size_t pos = 0;
127  while ((pos = out.find("\r", pos)) != std::string::npos) // Remove CR.
128  {
129  out.replace(pos, 1, "");
130  }
131 
132  out = regex_replace(out, regex("\\s+\n"), "\n"); // Remove space at eol.
133  out = regex_replace(out, regex("\n{2,}"), "\n"); // Reduce newlines.
134 
135  return unescape_html(out);
136  }
const string remove_html_tags(const string &html, const string &tag="")
Remove HTML tags.
Definition: uri.cpp:137
const string unescape_html(const string &html)
Convert HTML entities to UTF-8.
Definition: uri.cpp:175

◆ unescape_html()

const string remwharead::URI::unescape_html ( const string &  html)
protected

Convert HTML entities to UTF-8.

176  {
177  string buffer = html;
178  string output;
179 
180  // Used to convert int to utf-8 char.
181  std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> u8c;
182  regex re_entity("&#(x)?([[:alnum:]]{1,8});");
183  smatch match;
184 
185  while (regex_search(buffer, match, re_entity))
186  {
187  char32_t codepoint = 0;
188  // 'x' in front of the number means it's hexadecimal, else decimal.
189  if (match[1].length() == 1)
190  {
191  codepoint = std::stoi(match[2].str(), nullptr, 16);
192  }
193  else
194  {
195  codepoint = std::stoi(match[2].str(), nullptr, 10);
196  }
197  output += match.prefix().str() + u8c.to_bytes(codepoint);
198  buffer = match.suffix().str();
199  }
200  output += buffer;
201 
202  // Source: https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_
203  // entity_references#Character_entity_references_in_HTML
204  const array<const std::pair<const string, const char32_t>, 258> names =
205  {{
206  { "exclamation", 0x0021 },
207  { "quot", 0x0022 },
208  { "percent", 0x0025 },
209  { "amp", 0x0026 },
210  { "apos", 0x0027 },
211  { "add", 0x002B },
212  { "lt", 0x003C },
213  { "equal", 0x003D },
214  { "gt", 0x003E },
215  { "nbsp", 0x00A0 },
216  { "iexcl", 0x00A1 },
217  { "cent", 0x00A2 },
218  { "pound", 0x00A3 },
219  { "curren", 0x00A4 },
220  { "yen", 0x00A5 },
221  { "brvbar", 0x00A6 },
222  { "sect", 0x00A7 },
223  { "uml", 0x00A8 },
224  { "copy", 0x00A9 },
225  { "ordf", 0x00AA },
226  { "laquo", 0x00AB },
227  { "not", 0x00AC },
228  { "shy", 0x00AD },
229  { "reg", 0x00AE },
230  { "macr", 0x00AF },
231  { "deg", 0x00B0 },
232  { "plusmn", 0x00B1 },
233  { "sup2", 0x00B2 },
234  { "sup3", 0x00B3 },
235  { "acute", 0x00B4 },
236  { "micro", 0x00B5 },
237  { "para", 0x00B6 },
238  { "middot", 0x00B7 },
239  { "cedil", 0x00B8 },
240  { "sup1", 0x00B9 },
241  { "ordm", 0x00BA },
242  { "raquo", 0x00BB },
243  { "frac14", 0x00BC },
244  { "frac12", 0x00BD },
245  { "frac34", 0x00BE },
246  { "iquest", 0x00BF },
247  { "Agrave", 0x00C0 },
248  { "Aacute", 0x00C1 },
249  { "Acirc", 0x00C2 },
250  { "Atilde", 0x00C3 },
251  { "Auml", 0x00C4 },
252  { "Aring", 0x00C5 },
253  { "AElig", 0x00C6 },
254  { "Ccedil", 0x00C7 },
255  { "Egrave", 0x00C8 },
256  { "Eacute", 0x00C9 },
257  { "Ecirc", 0x00CA },
258  { "Euml", 0x00CB },
259  { "Igrave", 0x00CC },
260  { "Iacute", 0x00CD },
261  { "Icirc", 0x00CE },
262  { "Iuml", 0x00CF },
263  { "ETH", 0x00D0 },
264  { "Ntilde", 0x00D1 },
265  { "Ograve", 0x00D2 },
266  { "Oacute", 0x00D3 },
267  { "Ocirc", 0x00D4 },
268  { "Otilde", 0x00D5 },
269  { "Ouml", 0x00D6 },
270  { "times", 0x00D7 },
271  { "Oslash", 0x00D8 },
272  { "Ugrave", 0x00D9 },
273  { "Uacute", 0x00DA },
274  { "Ucirc", 0x00DB },
275  { "Uuml", 0x00DC },
276  { "Yacute", 0x00DD },
277  { "THORN", 0x00DE },
278  { "szlig", 0x00DF },
279  { "agrave", 0x00E0 },
280  { "aacute", 0x00E1 },
281  { "acirc", 0x00E2 },
282  { "atilde", 0x00E3 },
283  { "auml", 0x00E4 },
284  { "aring", 0x00E5 },
285  { "aelig", 0x00E6 },
286  { "ccedil", 0x00E7 },
287  { "egrave", 0x00E8 },
288  { "eacute", 0x00E9 },
289  { "ecirc", 0x00EA },
290  { "euml", 0x00EB },
291  { "igrave", 0x00EC },
292  { "iacute", 0x00ED },
293  { "icirc", 0x00EE },
294  { "iuml", 0x00EF },
295  { "eth", 0x00F0 },
296  { "ntilde", 0x00F1 },
297  { "ograve", 0x00F2 },
298  { "oacute", 0x00F3 },
299  { "ocirc", 0x00F4 },
300  { "otilde", 0x00F5 },
301  { "ouml", 0x00F6 },
302  { "divide", 0x00F7 },
303  { "oslash", 0x00F8 },
304  { "ugrave", 0x00F9 },
305  { "uacute", 0x00FA },
306  { "ucirc", 0x00FB },
307  { "uuml", 0x00FC },
308  { "yacute", 0x00FD },
309  { "thorn", 0x00FE },
310  { "yuml", 0x00FF },
311  { "OElig", 0x0152 },
312  { "oelig", 0x0153 },
313  { "Scaron", 0x0160 },
314  { "scaron", 0x0161 },
315  { "Yuml", 0x0178 },
316  { "fnof", 0x0192 },
317  { "circ", 0x02C6 },
318  { "tilde", 0x02DC },
319  { "Alpha", 0x0391 },
320  { "Beta", 0x0392 },
321  { "Gamma", 0x0393 },
322  { "Delta", 0x0394 },
323  { "Epsilon", 0x0395 },
324  { "Zeta", 0x0396 },
325  { "Eta", 0x0397 },
326  { "Theta", 0x0398 },
327  { "Iota", 0x0399 },
328  { "Kappa", 0x039A },
329  { "Lambda", 0x039B },
330  { "Mu", 0x039C },
331  { "Nu", 0x039D },
332  { "Xi", 0x039E },
333  { "Omicron", 0x039F },
334  { "Pi", 0x03A0 },
335  { "Rho", 0x03A1 },
336  { "Sigma", 0x03A3 },
337  { "Tau", 0x03A4 },
338  { "Upsilon", 0x03A5 },
339  { "Phi", 0x03A6 },
340  { "Chi", 0x03A7 },
341  { "Psi", 0x03A8 },
342  { "Omega", 0x03A9 },
343  { "alpha", 0x03B1 },
344  { "beta", 0x03B2 },
345  { "gamma", 0x03B3 },
346  { "delta", 0x03B4 },
347  { "epsilon", 0x03B5 },
348  { "zeta", 0x03B6 },
349  { "eta", 0x03B7 },
350  { "theta", 0x03B8 },
351  { "iota", 0x03B9 },
352  { "kappa", 0x03BA },
353  { "lambda", 0x03BB },
354  { "mu", 0x03BC },
355  { "nu", 0x03BD },
356  { "xi", 0x03BE },
357  { "omicron", 0x03BF },
358  { "pi", 0x03C0 },
359  { "rho", 0x03C1 },
360  { "sigmaf", 0x03C2 },
361  { "sigma", 0x03C3 },
362  { "tau", 0x03C4 },
363  { "upsilon", 0x03C5 },
364  { "phi", 0x03C6 },
365  { "chi", 0x03C7 },
366  { "psi", 0x03C8 },
367  { "omega", 0x03C9 },
368  { "thetasym", 0x03D1 },
369  { "upsih", 0x03D2 },
370  { "piv", 0x03D6 },
371  { "ensp", 0x2002 },
372  { "emsp", 0x2003 },
373  { "thinsp", 0x2009 },
374  { "zwnj", 0x200C },
375  { "zwj", 0x200D },
376  { "lrm", 0x200E },
377  { "rlm", 0x200F },
378  { "ndash", 0x2013 },
379  { "mdash", 0x2014 },
380  { "horbar", 0x2015 },
381  { "lsquo", 0x2018 },
382  { "rsquo", 0x2019 },
383  { "sbquo", 0x201A },
384  { "ldquo", 0x201C },
385  { "rdquo", 0x201D },
386  { "bdquo", 0x201E },
387  { "dagger", 0x2020 },
388  { "Dagger", 0x2021 },
389  { "bull", 0x2022 },
390  { "hellip", 0x2026 },
391  { "permil", 0x2030 },
392  { "prime", 0x2032 },
393  { "Prime", 0x2033 },
394  { "lsaquo", 0x2039 },
395  { "rsaquo", 0x203A },
396  { "oline", 0x203E },
397  { "frasl", 0x2044 },
398  { "euro", 0x20AC },
399  { "image", 0x2111 },
400  { "weierp", 0x2118 },
401  { "real", 0x211C },
402  { "trade", 0x2122 },
403  { "alefsym", 0x2135 },
404  { "larr", 0x2190 },
405  { "uarr", 0x2191 },
406  { "rarr", 0x2192 },
407  { "darr", 0x2193 },
408  { "harr", 0x2194 },
409  { "crarr", 0x21B5 },
410  { "lArr", 0x21D0 },
411  { "uArr", 0x21D1 },
412  { "rArr", 0x21D2 },
413  { "dArr", 0x21D3 },
414  { "hArr", 0x21D4 },
415  { "forall", 0x2200 },
416  { "part", 0x2202 },
417  { "exist", 0x2203 },
418  { "empty", 0x2205 },
419  { "nabla", 0x2207 },
420  { "isin", 0x2208 },
421  { "notin", 0x2209 },
422  { "ni", 0x220B },
423  { "prod", 0x220F },
424  { "sum", 0x2211 },
425  { "minus", 0x2212 },
426  { "lowast", 0x2217 },
427  { "radic", 0x221A },
428  { "prop", 0x221D },
429  { "infin", 0x221E },
430  { "ang", 0x2220 },
431  { "and", 0x2227 },
432  { "or", 0x2228 },
433  { "cap", 0x2229 },
434  { "cup", 0x222A },
435  { "int", 0x222B },
436  { "there4", 0x2234 },
437  { "sim", 0x223C },
438  { "cong", 0x2245 },
439  { "asymp", 0x2248 },
440  { "ne", 0x2260 },
441  { "equiv", 0x2261 },
442  { "le", 0x2264 },
443  { "ge", 0x2265 },
444  { "sub", 0x2282 },
445  { "sup", 0x2283 },
446  { "nsub", 0x2284 },
447  { "sube", 0x2286 },
448  { "supe", 0x2287 },
449  { "oplus", 0x2295 },
450  { "otimes", 0x2297 },
451  { "perp", 0x22A5 },
452  { "sdot", 0x22C5 },
453  { "lceil", 0x2308 },
454  { "rceil", 0x2309 },
455  { "lfloor", 0x230A },
456  { "rfloor", 0x230B },
457  { "lang", 0x2329 },
458  { "rang", 0x232A },
459  { "loz", 0x25CA },
460  { "spades", 0x2660 },
461  { "clubs", 0x2663 },
462  { "hearts", 0x2665 },
463  { "diams", 0x2666 }
464  }};
465 
466  for (auto &pair : names)
467  {
468  const regex re('&' + pair.first + ';');
469  output = regex_replace(output, re, u8c.to_bytes(pair.second));
470  }
471 
472  return output;
473  }

The documentation for this class was generated from the following files: