2019-07-27 22:46:58 +02:00
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
< html xmlns = "http://www.w3.org/1999/xhtml" >
< head >
< meta http-equiv = "Content-Type" content = "text/xhtml;charset=UTF-8" / >
< meta http-equiv = "X-UA-Compatible" content = "IE=9" / >
< meta name = "generator" content = "Doxygen 1.8.14" / >
< meta name = "viewport" content = "width=device-width, initial-scale=1" / >
< title > remwharead: remwharead::URI Class Reference< / title >
< link href = "tabs.css" rel = "stylesheet" type = "text/css" / >
< script type = "text/javascript" src = "jquery.js" > < / script >
< script type = "text/javascript" src = "dynsections.js" > < / script >
< link href = "search/search.css" rel = "stylesheet" type = "text/css" / >
< script type = "text/javascript" src = "search/searchdata.js" > < / script >
< script type = "text/javascript" src = "search/search.js" > < / script >
< link href = "doxygen.css" rel = "stylesheet" type = "text/css" / >
< / head >
< body >
< div id = "top" > <!-- do not remove this div, it is closed by doxygen! -->
< div id = "titlearea" >
< table cellspacing = "0" cellpadding = "0" >
< tbody >
< tr style = "height: 56px;" >
< td id = "projectalign" style = "padding-left: 0.5em;" >
< div id = "projectname" > remwharead
  < span id = "projectnumber" > 0.6.3< / span >
< / div >
< / td >
< / tr >
< / tbody >
< / table >
< / div >
<!-- end header part -->
<!-- Generated by Doxygen 1.8.14 -->
< script type = "text/javascript" >
/* @license magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3& dn=gpl-2.0.txt GPL-v2 */
var searchBox = new SearchBox("searchBox", "search",false,'Search');
/* @license-end */
< / script >
< script type = "text/javascript" src = "menudata.js" > < / script >
< script type = "text/javascript" src = "menu.js" > < / script >
< script type = "text/javascript" >
/* @license magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3& dn=gpl-2.0.txt GPL-v2 */
$(function() {
initMenu('',true,false,'search.php','Search');
$(document).ready(function() { init_search(); });
});
/* @license-end */< / script >
< div id = "main-nav" > < / div >
<!-- window showing the filter options -->
< div id = "MSearchSelectWindow"
onmouseover="return searchBox.OnSearchSelectShow()"
onmouseout="return searchBox.OnSearchSelectHide()"
onkeydown="return searchBox.OnSearchSelectKey(event)">
< / div >
<!-- iframe showing the search results (closed by default) -->
< div id = "MSearchResultsWindow" >
< iframe src = "javascript:void(0)" frameborder = "0"
name="MSearchResults" id="MSearchResults">
< / iframe >
< / div >
< div id = "nav-path" class = "navpath" >
< ul >
< li class = "navelem" > < b > remwharead< / b > < / li > < li class = "navelem" > < a class = "el" href = "classremwharead_1_1URI.html" > URI< / a > < / li > < / ul >
< / div >
< / div > <!-- top -->
< div class = "header" >
< div class = "summary" >
< a href = "#pub-methods" > Public Member Functions< / a > |
< a href = "#pro-methods" > Protected Member Functions< / a > |
< a href = "#pro-attribs" > Protected Attributes< / a > |
< a href = "classremwharead_1_1URI-members.html" > List of all members< / a > < / div >
< div class = "headertitle" >
< div class = "title" > remwharead::URI Class Reference< / div > < / div >
< / div > <!-- header -->
< div class = "contents" >
2019-07-28 02:30:30 +02:00
< p > Download, archive and process a URI.
< a href = "classremwharead_1_1URI.html#details" > More...< / a > < / p >
< p > < code > #include < < a class = "el" href = "uri_8hpp_source.html" > uri.hpp< / a > > < / code > < / p >
2019-07-27 22:46:58 +02:00
< table class = "memberdecls" >
< tr class = "heading" > < td colspan = "2" > < h2 class = "groupheader" > < a name = "pub-methods" > < / a >
Public Member Functions< / h2 > < / td > < / tr >
2019-07-28 02:30:30 +02:00
< tr class = "memitem:acda508768b1fd3b4df81ea66dd4fab87" > < td class = "memItemLeft" align = "right" valign = "top" >   < / td > < td class = "memItemRight" valign = "bottom" > < a class = "el" href = "classremwharead_1_1URI.html#acda508768b1fd3b4df81ea66dd4fab87" > URI< / a > (const string & uri)< / td > < / tr >
< tr class = "memdesc:acda508768b1fd3b4df81ea66dd4fab87" > < td class = "mdescLeft" >   < / td > < td class = "mdescRight" > Construct object and set URL. < a href = "#acda508768b1fd3b4df81ea66dd4fab87" > More...< / a > < br / > < / td > < / tr >
2019-07-27 22:46:58 +02:00
< tr class = "separator:acda508768b1fd3b4df81ea66dd4fab87" > < td class = "memSeparator" colspan = "2" >   < / td > < / tr >
2019-07-28 02:30:30 +02:00
< tr class = "memitem:a8d6ac084a823749ed38c12e7bf8f3461" > < td class = "memItemLeft" align = "right" valign = "top" > const < a class = "el" href = "structremwharead_1_1html__extract.html" > html_extract< / a >   < / td > < td class = "memItemRight" valign = "bottom" > < a class = "el" href = "classremwharead_1_1URI.html#a8d6ac084a823749ed38c12e7bf8f3461" > get< / a > ()< / td > < / tr >
< tr class = "memdesc:a8d6ac084a823749ed38c12e7bf8f3461" > < td class = "mdescLeft" >   < / td > < td class = "mdescRight" > Download URI and extract title, description and full text. < a href = "#a8d6ac084a823749ed38c12e7bf8f3461" > More...< / a > < br / > < / td > < / tr >
2019-07-27 22:46:58 +02:00
< tr class = "separator:a8d6ac084a823749ed38c12e7bf8f3461" > < td class = "memSeparator" colspan = "2" >   < / td > < / tr >
2019-07-28 02:30:30 +02:00
< tr class = "memitem:a074b3aa3ec7658b184869599c9ab6445" > < td class = "memItemLeft" align = "right" valign = "top" > const string  < / td > < td class = "memItemRight" valign = "bottom" > < a class = "el" href = "classremwharead_1_1URI.html#a074b3aa3ec7658b184869599c9ab6445" > archive< / a > ()< / td > < / tr >
< tr class = "memdesc:a074b3aa3ec7658b184869599c9ab6445" > < td class = "mdescLeft" >   < / td > < td class = "mdescRight" > Save URI in archive and return archive-URI. < a href = "#a074b3aa3ec7658b184869599c9ab6445" > More...< / a > < br / > < / td > < / tr >
2019-07-27 22:46:58 +02:00
< tr class = "separator:a074b3aa3ec7658b184869599c9ab6445" > < td class = "memSeparator" colspan = "2" >   < / td > < / tr >
< / table > < table class = "memberdecls" >
< tr class = "heading" > < td colspan = "2" > < h2 class = "groupheader" > < a name = "pro-methods" > < / a >
Protected Member Functions< / h2 > < / td > < / tr >
2019-07-28 02:30:30 +02:00
< tr class = "memitem:a203f46af6271ecd5a1ff01c63461d842" > < td class = "memItemLeft" align = "right" valign = "top" > void  < / td > < td class = "memItemRight" valign = "bottom" > < a class = "el" href = "classremwharead_1_1URI.html#a203f46af6271ecd5a1ff01c63461d842" > set_curlpp_options< / a > (curlpp::Easy & request)< / td > < / tr >
< tr class = "memdesc:a203f46af6271ecd5a1ff01c63461d842" > < td class = "mdescLeft" >   < / td > < td class = "mdescRight" > Sets common curlpp options. < a href = "#a203f46af6271ecd5a1ff01c63461d842" > More...< / a > < br / > < / td > < / tr >
2019-07-27 22:46:58 +02:00
< tr class = "separator:a203f46af6271ecd5a1ff01c63461d842" > < td class = "memSeparator" colspan = "2" >   < / td > < / tr >
2019-07-28 02:30:30 +02:00
< tr class = "memitem:a37f93c46371d9b3753ae04bd2ef2c362" > < td class = "memItemLeft" align = "right" valign = "top" > const string  < / td > < td class = "memItemRight" valign = "bottom" > < a class = "el" href = "classremwharead_1_1URI.html#a37f93c46371d9b3753ae04bd2ef2c362" > extract_title< / a > (const string & html)< / td > < / tr >
< tr class = "memdesc:a37f93c46371d9b3753ae04bd2ef2c362" > < td class = "mdescLeft" >   < / td > < td class = "mdescRight" > Extract the title from an HTML page. < a href = "#a37f93c46371d9b3753ae04bd2ef2c362" > More...< / a > < br / > < / td > < / tr >
2019-07-27 22:46:58 +02:00
< tr class = "separator:a37f93c46371d9b3753ae04bd2ef2c362" > < td class = "memSeparator" colspan = "2" >   < / td > < / tr >
2019-07-28 02:30:30 +02:00
< tr class = "memitem:ae6ff7a41b9529eb8f4c7f2ace7260dc7" > < td class = "memItemLeft" align = "right" valign = "top" > const string  < / td > < td class = "memItemRight" valign = "bottom" > < a class = "el" href = "classremwharead_1_1URI.html#ae6ff7a41b9529eb8f4c7f2ace7260dc7" > extract_description< / a > (const string & html)< / td > < / tr >
< tr class = "memdesc:ae6ff7a41b9529eb8f4c7f2ace7260dc7" > < td class = "mdescLeft" >   < / td > < td class = "mdescRight" > Extract the description from an HTML page. < a href = "#ae6ff7a41b9529eb8f4c7f2ace7260dc7" > More...< / a > < br / > < / td > < / tr >
2019-07-27 22:46:58 +02:00
< tr class = "separator:ae6ff7a41b9529eb8f4c7f2ace7260dc7" > < td class = "memSeparator" colspan = "2" >   < / td > < / tr >
2019-07-28 02:30:30 +02:00
< tr class = "memitem:ad6ad5351ecf2983e01f9f4a51c2057a5" > < td class = "memItemLeft" align = "right" valign = "top" > const string  < / td > < td class = "memItemRight" valign = "bottom" > < a class = "el" href = "classremwharead_1_1URI.html#ad6ad5351ecf2983e01f9f4a51c2057a5" > strip_html< / a > (const string & html)< / td > < / tr >
< tr class = "memdesc:ad6ad5351ecf2983e01f9f4a51c2057a5" > < td class = "mdescLeft" >   < / td > < td class = "mdescRight" > Removes HTML tags and superfluous spaces from an HTML page. < a href = "#ad6ad5351ecf2983e01f9f4a51c2057a5" > More...< / a > < br / > < / td > < / tr >
2019-07-27 22:46:58 +02:00
< tr class = "separator:ad6ad5351ecf2983e01f9f4a51c2057a5" > < td class = "memSeparator" colspan = "2" >   < / td > < / tr >
2019-07-28 02:30:30 +02:00
< tr class = "memitem:a8b340b13ccf0bc3ae9059872ce48e06a" > < td class = "memItemLeft" align = "right" valign = "top" > const string  < / td > < td class = "memItemRight" valign = "bottom" > < a class = "el" href = "classremwharead_1_1URI.html#a8b340b13ccf0bc3ae9059872ce48e06a" > remove_html_tags< / a > (const string & html, const string & tag=" " )< / td > < / tr >
< tr class = "memdesc:a8b340b13ccf0bc3ae9059872ce48e06a" > < td class = "mdescLeft" >   < / td > < td class = "mdescRight" > Remove HTML tags. < a href = "#a8b340b13ccf0bc3ae9059872ce48e06a" > More...< / a > < br / > < / td > < / tr >
2019-07-27 22:46:58 +02:00
< tr class = "separator:a8b340b13ccf0bc3ae9059872ce48e06a" > < td class = "memSeparator" colspan = "2" >   < / td > < / tr >
2019-07-28 02:30:30 +02:00
< tr class = "memitem:a37c36dbf8ccc03c8cc132be29e49f4ec" > < td class = "memItemLeft" align = "right" valign = "top" > const string  < / td > < td class = "memItemRight" valign = "bottom" > < a class = "el" href = "classremwharead_1_1URI.html#a37c36dbf8ccc03c8cc132be29e49f4ec" > unescape_html< / a > (const string & html)< / td > < / tr >
< tr class = "memdesc:a37c36dbf8ccc03c8cc132be29e49f4ec" > < td class = "mdescLeft" >   < / td > < td class = "mdescRight" > Convert HTML entities to UTF-8. < a href = "#a37c36dbf8ccc03c8cc132be29e49f4ec" > More...< / a > < br / > < / td > < / tr >
2019-07-27 22:46:58 +02:00
< tr class = "separator:a37c36dbf8ccc03c8cc132be29e49f4ec" > < td class = "memSeparator" colspan = "2" >   < / td > < / tr >
2019-07-28 02:30:30 +02:00
< tr class = "memitem:a9373cb28de198ae2db624980273ece4a" > < td class = "memItemLeft" align = "right" valign = "top" > const string  < / td > < td class = "memItemRight" valign = "bottom" > < a class = "el" href = "classremwharead_1_1URI.html#a9373cb28de198ae2db624980273ece4a" > remove_newlines< / a > (string text)< / td > < / tr >
< tr class = "memdesc:a9373cb28de198ae2db624980273ece4a" > < td class = "mdescLeft" >   < / td > < td class = "mdescRight" > Replace newlines with spaces. < a href = "#a9373cb28de198ae2db624980273ece4a" > More...< / a > < br / > < / td > < / tr >
2019-07-27 22:46:58 +02:00
< tr class = "separator:a9373cb28de198ae2db624980273ece4a" > < td class = "memSeparator" colspan = "2" >   < / td > < / tr >
< / table > < table class = "memberdecls" >
< tr class = "heading" > < td colspan = "2" > < h2 class = "groupheader" > < a name = "pro-attribs" > < / a >
Protected Attributes< / h2 > < / td > < / tr >
< tr class = "memitem:a6d76848066779348084046a63bdaedc0" > < td class = "memItemLeft" align = "right" valign = "top" > < a id = "a6d76848066779348084046a63bdaedc0" > < / a >
string  < / td > < td class = "memItemRight" valign = "bottom" > < b > _uri< / b > < / td > < / tr >
< tr class = "separator:a6d76848066779348084046a63bdaedc0" > < td class = "memSeparator" colspan = "2" >   < / td > < / tr >
< / table >
2019-07-28 02:30:30 +02:00
< a name = "details" id = "details" > < / a > < h2 class = "groupheader" > Detailed Description< / h2 >
< div class = "textblock" > < p > Download, archive and process a URI. < / p >
< / div > < h2 class = "groupheader" > Constructor & Destructor Documentation< / h2 >
< a id = "acda508768b1fd3b4df81ea66dd4fab87" > < / a >
< h2 class = "memtitle" > < span class = "permalink" > < a href = "#acda508768b1fd3b4df81ea66dd4fab87" > ◆ < / a > < / span > URI()< / h2 >
< div class = "memitem" >
< div class = "memproto" >
< table class = "mlabels" >
< tr >
< td class = "mlabels-left" >
< table class = "memname" >
< tr >
< td class = "memname" > remwharead::URI::URI < / td >
< td > (< / td >
< td class = "paramtype" > const string &   < / td >
< td class = "paramname" > < em > uri< / em > < / td > < td > )< / td >
< td > < / td >
< / tr >
< / table >
< / td >
< td class = "mlabels-right" >
< span class = "mlabels" > < span class = "mlabel" > explicit< / span > < / span > < / td >
< / tr >
< / table >
< / div > < div class = "memdoc" >
< p > Construct object and set URL. < / p >
< div class = "fragment" > < div class = "line" > < a name = "l00044" > < / a > < span class = "lineno" > 44< / span >   :_uri(uri)< / div > < div class = "line" > < a name = "l00045" > < / a > < span class = "lineno" > 45< / span >   {< / div > < div class = "line" > < a name = "l00046" > < / a > < span class = "lineno" > 46< / span >   }< / div > < / div > <!-- fragment -->
< / div >
< / div >
< h2 class = "groupheader" > Member Function Documentation< / h2 >
< a id = "a074b3aa3ec7658b184869599c9ab6445" > < / a >
< h2 class = "memtitle" > < span class = "permalink" > < a href = "#a074b3aa3ec7658b184869599c9ab6445" > ◆ < / a > < / span > archive()< / h2 >
< div class = "memitem" >
< div class = "memproto" >
< table class = "memname" >
< tr >
< td class = "memname" > const string remwharead::URI::archive < / td >
< td > (< / td >
< td class = "paramname" > < / td > < td > )< / td >
< td > < / td >
< / tr >
< / table >
< / div > < div class = "memdoc" >
< p > Save URI in archive and return archive-URI. < / p >
< div class = "fragment" > < div class = "line" > < a name = "l00476" > < / a > < span class = "lineno" > 476< / span >   {< / div > < div class = "line" > < a name = "l00477" > < / a > < span class = "lineno" > 477< / span >   < span class = "keywordflow" > if< / span > (_uri.substr(0, 4) != < span class = "stringliteral" > " http" < / span > )< / div > < div class = "line" > < a name = "l00478" > < / a > < span class = "lineno" > 478< / span >   {< / div > < div class = "line" > < a name = "l00479" > < / a > < span class = "lineno" > 479< / span >   < span class = "keywordflow" > return< / span > < span class = "stringliteral" > " " < / span > ;< / div > < div class = "line" > < a name = "l00480" > < / a > < span class = "lineno" > 480< / span >   }< / div > < div class = "line" > < a name = "l00481" > < / a > < span class = "lineno" > 481< / span >   < / div > < div class = "line" > < a name = "l00482" > < / a > < span class = "lineno" > 482< / span >   < span class = "keywordflow" > try< / span > < / div > < div class = "line" > < a name = "l00483" > < / a > < span class = "lineno" > 483< / span >   {< / div > < div class = "line" > < a name = "l00484" > < / a > < span class = "lineno" > 484< / span >   std::ostringstream oss;< / div > < div class = "line" > < a name = "l00485" > < / a > < span class = "lineno" > 485< / span >   curlpp::Easy request;< / div > < div class = "line" > < a name = "l00486" > < / a > < span class = "lineno" > 486< / span >   < a class = "code" href = "classremwharead_1_1URI.html#a203f46af6271ecd5a1ff01c63461d842" > set_curlpp_options< / a > (request);< / div > < div class = "line" > < a name = "l00487" > < / a > < span class = "lineno" > 487< / span >   request.setOpt< curlopts::Url> (< span class = "stringliteral" > " https://web.archive.org/save/" < / span > < / div > < div class = "line" > < a name = "l00488" > < / a > < span class = "lineno" > 488< / span >   + _uri);< / div > < div class = "line" > < a name = "l00489" > < / a > 
< span class = "lineno" > 489< / span >   request.setOpt< curlopts::WriteStream> (& oss);< / div > < div class = "line" > < a name = "l00490" > < / a > < span class = "lineno" > 490< / span >   request.setOpt< curlopts::NoBody> (< span class = "keyword" > true< / span > ); < span class = "comment" > // Make HEAD request.< / span > < / div > < div class = "line" > < a name = "l00491" > < / a > < span class = "lineno" > 491< / span >   request.setOpt< curlpp::options::Header> (< span class = "keyword" > true< / span > ); < span class = "comment" > // Keep headers.< / span > < / div > < div class = "line" > < a name = "l00492" > < / a > < span class = "lineno" > 492< / span >   request.perform();< / div > < div class = "line" > < a name = "l00493" > < / a > < span class = "lineno" > 493< / span >   < / div > < div class = "line" > < a name = "l00494" > < / a > < span class = "lineno" > 494< / span >   smatch match;< / div > < div class = "line" > < a name = "l00495" > < / a > < span class = "lineno" > 495< / span >   < span class = "keyword" > const< / span > < span class = "keywordtype" > string< / span > answer = oss.str();< / div > < div class = "line" > < a name = "l00496" > < / a > < span class = "lineno" > 496< / span >   < span class = "keywordflow" > if< / span > (regex_search(answer, match, regex(< span class = "stringliteral" > " Content-Location: (.+)\r" < / span > )))< / div > < div class = "line" > < a name = "l00497" > < / a > < span class = "lineno" > 497< / span >   {< / div > < div class = "line" > < a name = "l00498" > < / a > < span class = "lineno" > 498< / span >   < span class = "keywordflow" > return< / span > < span class = "stringliteral" > " https://web.archive.org" < / span > + match[1].str();< / div > < div class = "line" > < a name = "l00499" > < / a > < span class = "lineno" > 499< / span >   }< / div > < div class = "line" > < a name = "l00500" > < / a > < span class = "lineno" > 500< / span >   < span class = "keywordflow" > else< / 
span > < / div > < div class = "line" > < a name = "l00501" > < / a > < span class = "lineno" > 501< / span >   {< / div > < div class = "line" > < a name = "l00502" > < / a > < span class = "lineno" > 502< / span >   cerr < < < span class = "stringliteral" > " Error: Could not archive page. HTTP status: " < / span > < / div > < div class = "line" > < a name = "l00503" > < / a > < span class = "lineno" > 503< / span >   < < curlpp::info
< / div > <!-- fragment -->
< / div >
< / div >
< a id = "ae6ff7a41b9529eb8f4c7f2ace7260dc7" > < / a >
< h2 class = "memtitle" > < span class = "permalink" > < a href = "#ae6ff7a41b9529eb8f4c7f2ace7260dc7" > ◆ < / a > < / span > extract_description()< / h2 >
< div class = "memitem" >
< div class = "memproto" >
< table class = "mlabels" >
< tr >
< td class = "mlabels-left" >
< table class = "memname" >
< tr >
< td class = "memname" > const string remwharead::URI::extract_description < / td >
< td > (< / td >
< td class = "paramtype" > const string &   < / td >
< td class = "paramname" > < em > html< / em > < / td > < td > )< / td >
< td > < / td >
< / tr >
< / table >
< / td >
< td class = "mlabels-right" >
< span class = "mlabels" > < span class = "mlabel" > protected< / span > < / span > < / td >
< / tr >
< / table >
< / div > < div class = "memdoc" >
< p > Extract the description from an HTML page. < / p >
< div class = "fragment" > < div class = "line" > < a name = "l00105" > < / a > < span class = "lineno" > 105< / span >   {< / div > < div class = "line" > < a name = "l00106" > < / a > < span class = "lineno" > 106< / span >   < span class = "keyword" > const< / span > regex re_htmlfile(< span class = "stringliteral" > " \\.(.?html?|xml|rss)$" < / span > );< / div > < div class = "line" > < a name = "l00107" > < / a > < span class = "lineno" > 107< / span >   < span class = "keywordflow" > if< / span > (_uri.substr(0, 4) == < span class = "stringliteral" > " http" < / span > || regex_search(_uri, re_htmlfile))< / div > < div class = "line" > < a name = "l00108" > < / a > < span class = "lineno" > 108< / span >   {< / div > < div class = "line" > < a name = "l00109" > < / a > < span class = "lineno" > 109< / span >   smatch match;< / div > < div class = "line" > < a name = "l00110" > < / a > < span class = "lineno" > 110< / span >   < span class = "keyword" > const< / span > regex re(< span class = "stringliteral" > " description\" [^> ]+content=\" ([^\" ]+)" < / span > , icase);< / div > < div class = "line" > < a name = "l00111" > < / a > < span class = "lineno" > 111< / span >   regex_search(html, match, re);< / div > < div class = "line" > < a name = "l00112" > < / a > < span class = "lineno" > 112< / span >   < span class = "keywordflow" > return< / span > < a class = "code" href = "classremwharead_1_1URI.html#a9373cb28de198ae2db624980273ece4a" > remove_newlines< / a > (< a class = "code" href = "classremwharead_1_1URI.html#ad6ad5351ecf2983e01f9f4a51c2057a5" > strip_html< / a > (match[1].str()));< / div > < div class = "line" > < a name = "l00113" > < / a > < span class = "lineno" > 113< / span >   }< / div > < div class = "line" > < a name = "l00114" > < / a > < span class = "lineno" > 114< / span >   < / div > < div class = "line" > < a name = "l00115" > < / a > < span class = "lineno" > 115< / span >   < span class = "keywordflow" > return< / span > < span 
class = "stringliteral" > " " < / span > ;< / div > < div class = "line" > < a name = "l00116" > < / a > < span class = "lineno" > 116< / span >   }< / div > < div class = "ttc" id = "classremwharead_1_1URI_html_ad6ad5351ecf2983e01f9f4a51c2057a5" > < div class = "ttname" > < a href = "classremwharead_1_1URI.html#ad6ad5351ecf2983e01f9f4a51c2057a5" > remwharead::URI::strip_html< / a > < / div > < div class = "ttdeci" > const string strip_html(const string & html)< / div > < div class = "ttdoc" > Removes HTML tags and superfluous spaces from an HTML page. < / div > < div class = "ttdef" > < b > Definition:< / b > uri.cpp:118< / div > < / div >
< div class = "ttc" id = "classremwharead_1_1URI_html_a9373cb28de198ae2db624980273ece4a" > < div class = "ttname" > < a href = "classremwharead_1_1URI.html#a9373cb28de198ae2db624980273ece4a" > remwharead::URI::remove_newlines< / a > < / div > < div class = "ttdeci" > const string remove_newlines(string text)< / div > < div class = "ttdoc" > Replace newlines with spaces. < / div > < div class = "ttdef" > < b > Definition:< / b > uri.cpp:514< / div > < / div >
< / div > <!-- fragment -->
< / div >
< / div >
< a id = "a37f93c46371d9b3753ae04bd2ef2c362" > < / a >
< h2 class = "memtitle" > < span class = "permalink" > < a href = "#a37f93c46371d9b3753ae04bd2ef2c362" > ◆ < / a > < / span > extract_title()< / h2 >
< div class = "memitem" >
< div class = "memproto" >
< table class = "mlabels" >
< tr >
< td class = "mlabels-left" >
< table class = "memname" >
< tr >
< td class = "memname" > const string remwharead::URI::extract_title < / td >
< td > (< / td >
< td class = "paramtype" > const string &   < / td >
< td class = "paramname" > < em > html< / em > < / td > < td > )< / td >
< td > < / td >
< / tr >
< / table >
< / td >
< td class = "mlabels-right" >
< span class = "mlabels" > < span class = "mlabel" > protected< / span > < / span > < / td >
< / tr >
< / table >
< / div > < div class = "memdoc" >
< p > Extract the title from an HTML page. < / p >
< div class = "fragment" > < div class = "line" > < a name = "l00092" > < / a > < span class = "lineno" > 92< / span >   {< / div > < div class = "line" > < a name = "l00093" > < / a > < span class = "lineno" > 93< / span >   < span class = "keyword" > const< / span > regex re_htmlfile(< span class = "stringliteral" > " \\.(.?html?|xml|rss)$" < / span > );< / div > < div class = "line" > < a name = "l00094" > < / a > < span class = "lineno" > 94< / span >   < span class = "keywordflow" > if< / span > (_uri.substr(0, 4) == < span class = "stringliteral" > " http" < / span > || regex_search(_uri, re_htmlfile))< / div > < div class = "line" > < a name = "l00095" > < / a > < span class = "lineno" > 95< / span >   {< / div > < div class = "line" > < a name = "l00096" > < / a > < span class = "lineno" > 96< / span >   smatch match;< / div > < div class = "line" > < a name = "l00097" > < / a > < span class = "lineno" > 97< / span >   regex_search(html, match, regex(< span class = "stringliteral" > " < title> ([^< ]+)" < / span > , icase));< / div > < div class = "line" > < a name = "l00098" > < / a > < span class = "lineno" > 98< / span >   < span class = "keywordflow" > return< / span > < a class = "code" href = "classremwharead_1_1URI.html#a9373cb28de198ae2db624980273ece4a" > remove_newlines< / a > (< a class = "code" href = "classremwharead_1_1URI.html#a37c36dbf8ccc03c8cc132be29e49f4ec" > unescape_html< / a > (match[1].str()));< / div > < div class = "line" > < a name = "l00099" > < / a > < span class = "lineno" > 99< / span >   }< / div > < div class = "line" > < a name = "l00100" > < / a > < span class = "lineno" > 100< / span >   < / div > < div class = "line" > < a name = "l00101" > < / a > < span class = "lineno" > 101< / span >   < span class = "keywordflow" > return< / span > < span class = "stringliteral" > " " < / span > ;< / div > < div class = "line" > < a name = "l00102" > < / a > < span class = "lineno" > 102< / span >   }< / div > < div class = "ttc" id = 
"classremwharead_1_1URI_html_a9373cb28de198ae2db624980273ece4a" > < div class = "ttname" > < a href = "classremwharead_1_1URI.html#a9373cb28de198ae2db624980273ece4a" > remwharead::URI::remove_newlines< / a > < / div > < div class = "ttdeci" > const string remove_newlines(string text)< / div > < div class = "ttdoc" > Replace newlines with spaces. < / div > < div class = "ttdef" > < b > Definition:< / b > uri.cpp:514< / div > < / div >
< div class = "ttc" id = "classremwharead_1_1URI_html_a37c36dbf8ccc03c8cc132be29e49f4ec" > < div class = "ttname" > < a href = "classremwharead_1_1URI.html#a37c36dbf8ccc03c8cc132be29e49f4ec" > remwharead::URI::unescape_html< / a > < / div > < div class = "ttdeci" > const string unescape_html(const string & html)< / div > < div class = "ttdoc" > Convert HTML entities to UTF-8. < / div > < div class = "ttdef" > < b > Definition:< / b > uri.cpp:175< / div > < / div >
< / div > <!-- fragment -->
< / div >
< / div >
< a id = "a8d6ac084a823749ed38c12e7bf8f3461" > < / a >
< h2 class = "memtitle" > < span class = "permalink" > < a href = "#a8d6ac084a823749ed38c12e7bf8f3461" > ◆ < / a > < / span > get()< / h2 >
< div class = "memitem" >
< div class = "memproto" >
< table class = "memname" >
< tr >
< td class = "memname" > const < a class = "el" href = "structremwharead_1_1html__extract.html" > html_extract< / a > remwharead::URI::get < / td >
< td > (< / td >
< td class = "paramname" > < / td > < td > )< / td >
< td > < / td >
< / tr >
< / table >
< / div > < div class = "memdoc" >
< p > Download URI and extract title, description and full text. < / p >
< div class = "fragment" > < div class = "line" > < a name = "l00049" > < / a > < span class = "lineno" > 49< / span >   {< / div > < div class = "line" > < a name = "l00050" > < / a > < span class = "lineno" > 50< / span >   < span class = "keywordflow" > try< / span > < / div > < div class = "line" > < a name = "l00051" > < / a > < span class = "lineno" > 51< / span >   {< / div > < div class = "line" > < a name = "l00052" > < / a > < span class = "lineno" > 52< / span >   std::ostringstream oss;< / div > < div class = "line" > < a name = "l00053" > < / a > < span class = "lineno" > 53< / span >   curlpp::Easy request;< / div > < div class = "line" > < a name = "l00054" > < / a > < span class = "lineno" > 54< / span >   < a class = "code" href = "classremwharead_1_1URI.html#a203f46af6271ecd5a1ff01c63461d842" > set_curlpp_options< / a > (request);< / div > < div class = "line" > < a name = "l00055" > < / a > < span class = "lineno" > 55< / span >   request.setOpt< curlopts::Url> (_uri);< / div > < div class = "line" > < a name = "l00056" > < / a > < span class = "lineno" > 56< / span >   request.setOpt< curlopts::WriteStream> (& oss);< / div > < div class = "line" > < a name = "l00057" > < / a > < span class = "lineno" > 57< / span >   request.perform();< / div > < div class = "line" > < a name = "l00058" > < / a > < span class = "lineno" > 58< / span >   < / div > < div class = "line" > < a name = "l00059" > < / a > < span class = "lineno" > 59< / span >   < span class = "keyword" > const< / span > < span class = "keywordtype" > string< / span > answer = oss.str();< / div > < div class = "line" > < a name = "l00060" > < / a > < span class = "lineno" > 60< / span >   < span class = "keywordflow" > if< / span > (answer.empty())< / div > < div class = "line" > < a name = "l00061" > < / a > < span class = "lineno" > 61< / span >   {< / div > < div class = "line" > < a name = "l00062" > < / a > < span class = "lineno" > 62< / span >   cerr < < < span class = 
"stringliteral" > " Error: Could not download page. Response code: " < / span > < / div > < div class = "line" > < a name = "l00063" > < / a > < span class = "lineno" > 63< / span >   < < curlpp::infos::ResponseCode::get(request) < < endl;< / div > < div class = "line" > < a name = "l00064" > < / a > < span class = "lineno" > 64< / span >   }< / div > < div class = "line" > < a name = "l00065" > < / a > < span class = "lineno" > 65< / span >   < span class = "keywordflow" > else< / span > < / div > < div class = "line" > < a name = "l00066" > < / a > < span class = "lineno" > 66< / span >   {< / div > < div class = "line" > < a name = "l00067" > < / a > < span class = "lineno" > 67< / span >   < span class = "keywordflow" > return< / span > < / div > < div class = "line" > < a name = "l00068" > < / a > < span class = "lineno" > 68< / span >   {< / div > < div class = "line" > < a name = "l00069" > < / a > < span class = "lineno" > 69< / span >   < a class = "code" href = "classremwharead_1_1URI.html#a37f93c46371d9b3753ae04bd2ef2c362" > extract_title< / a > (answer),< / div > < div class = "line" > < a name = "l00070" > < / a > < span class = "lineno" > 70< / span >   < a class = "code" href = "classremwharead_1_1URI.html#ae6ff7a41b9529eb8f4c7f2ace7260dc7" > extract_description< / a > (answer),< / div > < div class = "line" > < a name = "l00071" > < / a > < span class = "lineno" > 71< / span >   < a class = "code" href = "classremwharead_1_1URI.html#ad6ad5351ecf2983e01f9f4a51c2057a5" > strip_html< / a > (answer)< / div > < div class = "line" > < a name = "l00072" > < / a > < span class = "lineno" > 72< / span >   };< / div > < div class = "line" > < a name = "l00073" > < / a > < span class = "lineno" > 73< / span >   }< / div > < div class = "line" > < a name = "l00074" > < / a > < span class = "lineno" > 74< / span >   }< / div > < div class = "line" > < a name = "l00075" > < / a > < span class = "lineno" > 75< / span >   < span class = "keywordflow" > catch< / 
span > (< span class = "keyword" > const< / span > std::exception & e)< / div > < div class = "line" > < a name = "l00076" > < / a > < span class = "lineno" > 76< / span >   {< / div > < div class = "line" > < a name = "l00077" > < / a > < span class = "lineno" > 77< / span >   cerr < < < span class = "stringliteral" > " Error in " < / span > < < __func__ < < < span class = "string
< div class = "ttc" id = "classremwharead_1_1URI_html_a203f46af6271ecd5a1ff01c63461d842" > < div class = "ttname" > < a href = "classremwharead_1_1URI.html#a203f46af6271ecd5a1ff01c63461d842" > remwharead::URI::set_curlpp_options< / a > < / div > < div class = "ttdeci" > void set_curlpp_options(curlpp::Easy & request)< / div > < div class = "ttdoc" > Sets common curlpp options. < / div > < div class = "ttdef" > < b > Definition:< / b > uri.cpp:83< / div > < / div >
< div class = "ttc" id = "classremwharead_1_1URI_html_a37f93c46371d9b3753ae04bd2ef2c362" > < div class = "ttname" > < a href = "classremwharead_1_1URI.html#a37f93c46371d9b3753ae04bd2ef2c362" > remwharead::URI::extract_title< / a > < / div > < div class = "ttdeci" > const string extract_title(const string & html)< / div > < div class = "ttdoc" > Extract the title from an HTML page. < / div > < div class = "ttdef" > < b > Definition:< / b > uri.cpp:91< / div > < / div >
< div class = "ttc" id = "classremwharead_1_1URI_html_ae6ff7a41b9529eb8f4c7f2ace7260dc7" > < div class = "ttname" > < a href = "classremwharead_1_1URI.html#ae6ff7a41b9529eb8f4c7f2ace7260dc7" > remwharead::URI::extract_description< / a > < / div > < div class = "ttdeci" > const string extract_description(const string & html)< / div > < div class = "ttdoc" > Extract the description from an HTML page. < / div > < div class = "ttdef" > < b > Definition:< / b > uri.cpp:104< / div > < / div >
< / div > <!-- fragment -->
< / div >
< / div >
< a id = "a8b340b13ccf0bc3ae9059872ce48e06a" > < / a >
< h2 class = "memtitle" > < span class = "permalink" > < a href = "#a8b340b13ccf0bc3ae9059872ce48e06a" > ◆ < / a > < / span > remove_html_tags()< / h2 >
< div class = "memitem" >
< div class = "memproto" >
< table class = "mlabels" >
< tr >
< td class = "mlabels-left" >
< table class = "memname" >
< tr >
< td class = "memname" > const string remwharead::URI::remove_html_tags < / td >
< td > (< / td >
< td class = "paramtype" > const string &   < / td >
< td class = "paramname" > < em > html< / em > , < / td >
< / tr >
< tr >
< td class = "paramkey" > < / td >
< td > < / td >
< td class = "paramtype" > const string &   < / td >
< td class = "paramname" > < em > tag< / em > = < code > " " < / code >   < / td >
< / tr >
< tr >
< td > < / td >
< td > )< / td >
< td > < / td > < td > < / td >
< / tr >
< / table >
< / td >
< td class = "mlabels-right" >
< span class = "mlabels" > < span class = "mlabel" > protected< / span > < / span > < / td >
< / tr >
< / table >
< / div > < div class = "memdoc" >
< p > Remove HTML tags. < / p >
< dl class = "params" > < dt > Parameters< / dt > < dd >
< table class = "params" >
< tr > < td class = "paramname" > html< / td > < td > HTML page. < / td > < / tr >
< tr > < td class = "paramname" > tag< / td > < td > If set, only remove this tag. < / td > < / tr >
< / table >
< / dd >
< / dl >
< div class = "fragment" > < div class = "line" > < a name = "l00138" > < / a > < span class = "lineno" > 138< / span >   {< / div > < div class = "line" > < a name = "l00139" > < / a > < span class = "lineno" > 139< / span >   < span class = "comment" > // NOTE: I did this with regex_replace before, but libstdc++ segfaulted.< / span > < / div > < div class = "line" > < a name = "l00140" > < / a > < span class = "lineno" > 140< / span >   < span class = "keywordtype" > string< / span > out;< / div > < div class = "line" > < a name = "l00141" > < / a > < span class = "lineno" > 141< / span >   < span class = "keywordflow" > if< / span > (tag.empty())< / div > < div class = "line" > < a name = "l00142" > < / a > < span class = "lineno" > 142< / span >   {< / div > < div class = "line" > < a name = "l00143" > < / a > < span class = "lineno" > 143< / span >   < span class = "keywordtype" > size_t< / span > pos = 0;< / div > < div class = "line" > < a name = "l00144" > < / a > < span class = "lineno" > 144< / span >   < span class = "keywordflow" > while< / span > (pos != std::string::npos)< / div > < div class = "line" > < a name = "l00145" > < / a > < span class = "lineno" > 145< / span >   {< / div > < div class = "line" > < a name = "l00146" > < / a > < span class = "lineno" > 146< / span >   < span class = "keywordtype" > size_t< / span > startpos = html.find(< span class = "charliteral" > ' < ' < / span > , pos);< / div > < div class = "line" > < a name = "l00147" > < / a > < span class = "lineno" > 147< / span >   < span class = "keywordtype" > size_t< / span > endpos = html.find(< span class = "charliteral" > ' > ' < / span > , startpos);< / div > < div class = "line" > < a name = "l00148" > < / a > < span class = "lineno" > 148< / span >   out += html.substr(pos, startpos - pos);< / div > < div class = "line" > < a name = "l00149" > < / a > < span class = "lineno" > 149< / span >   pos = endpos;< / div > < div class = "line" > < a name = "l00150" > < / a > < 
span class = "lineno" > 150< / span >   < span class = "keywordflow" > if< / span > (pos != std::string::npos)< / div > < div class = "line" > < a name = "l00151" > < / a > < span class = "lineno" > 151< / span >   {< / div > < div class = "line" > < a name = "l00152" > < / a > < span class = "lineno" > 152< / span >   ++pos;< / div > < div class = "line" > < a name = "l00153" > < / a > < span class = "lineno" > 153< / span >   }< / div > < div class = "line" > < a name = "l00154" > < / a > < span class = "lineno" > 154< / span >   }< / div > < div class = "line" > < a name = "l00155" > < / a > < span class = "lineno" > 155< / span >   }< / div > < div class = "line" > < a name = "l00156" > < / a > < span class = "lineno" > 156< / span >   < span class = "keywordflow" > else< / span > < / div > < div class = "line" > < a name = "l00157" > < / a > < span class = "lineno" > 157< / span >   {< / div > < div class = "line" > < a name = "l00158" > < / a > < span class = "lineno" > 158< / span >   < span class = "keywordtype" > size_t< / span > pos = 0;< / div > < div class = "line" > < a name = "l00159" > < / a > < span class = "lineno" > 159< / span >   out = html;< / div > < div class = "line" > < a name = "l00160" > < / a > < span class = "lineno" > 160< / span >   < span class = "keywordflow" > while< / span > ((pos = out.find(< span class = "stringliteral" > " < " < / span > + tag)) != std::string::npos)< / div > < div class = "line" > < a name = "l00161" > < / a > < span class = "lineno" > 161< / span >   {< / div > < div class = "line" > < a name = "l00162" > < / a > < span class = "lineno" > 162< / span >   < span class = "keywordtype" > size_t< / span > endpos = out.find(< span class = "stringliteral" > " < /" < / span > + tag, pos);< / div > < div class = "line" > < a name = "l00163" > < / a > < span class = "lineno" > 163< / span >   < span class = "keywordflow" > if< / span > (endpos == std::string::npos)< / div > < div class = "line" > < a name = "l00164" > 
< / a > < span class = "lineno" > 164< / span >   {< / div > < div class = "line" > < a name = "l00165" > < / a > < span class = "lineno" > 165< / span >   < span class = "keywordflow" > break< / span > ;< / div > < div class = "line" > < a name = "l00166" > < / a > < span class = "lineno" > 166< / span >   }< / div > < div class = "line" > < a name = "l00167" > < / a > < span class = "lineno" > 167< / span >   endpos += 3 + tag.length(); < span class
< / div >
< / div >
< a id = "a9373cb28de198ae2db624980273ece4a" > < / a >
< h2 class = "memtitle" > < span class = "permalink" > < a href = "#a9373cb28de198ae2db624980273ece4a" > ◆ < / a > < / span > remove_newlines()< / h2 >
< div class = "memitem" >
< div class = "memproto" >
< table class = "mlabels" >
< tr >
< td class = "mlabels-left" >
< table class = "memname" >
< tr >
< td class = "memname" > const string remwharead::URI::remove_newlines < / td >
< td > (< / td >
< td class = "paramtype" > string  < / td >
< td class = "paramname" > < em > text< / em > < / td > < td > )< / td >
< td > < / td >
< / tr >
< / table >
< / td >
< td class = "mlabels-right" >
< span class = "mlabels" > < span class = "mlabel" > protected< / span > < / span > < / td >
< / tr >
< / table >
< / div > < div class = "memdoc" >
< p > Replace newlines with spaces. < / p >
< div class = "fragment" > < div class = "line" > < a name = "l00515" > < / a > < span class = "lineno" > 515< / span >   {< / div > < div class = "line" > < a name = "l00516" > < / a > < span class = "lineno" > 516< / span >   < span class = "keywordtype" > size_t< / span > posn = 0;< / div > < div class = "line" > < a name = "l00517" > < / a > < span class = "lineno" > 517< / span >   < span class = "keywordflow" > while< / span > ((posn = text.find(< span class = "charliteral" > ' \n' < / span > , posn)) != std::string::npos)< / div > < div class = "line" > < a name = "l00518" > < / a > < span class = "lineno" > 518< / span >   {< / div > < div class = "line" > < a name = "l00519" > < / a > < span class = "lineno" > 519< / span >   text.replace(posn, 1, < span class = "stringliteral" > " " < / span > );< / div > < div class = "line" > < a name = "l00520" > < / a > < span class = "lineno" > 520< / span >   < / div > < div class = "line" > < a name = "l00521" > < / a > < span class = "lineno" > 521< / span >   < span class = "keywordtype" > size_t< / span > posr = posn - 1;< / div > < div class = "line" > < a name = "l00522" > < / a > < span class = "lineno" > 522< / span >   < span class = "keywordflow" > if< / span > (text[posr] == < span class = "charliteral" > ' \r' < / span > )< / div > < div class = "line" > < a name = "l00523" > < / a > < span class = "lineno" > 523< / span >   {< / div > < div class = "line" > < a name = "l00524" > < / a > < span class = "lineno" > 524< / span >   text.replace(posr, 1, < span class = "stringliteral" > " " < / span > );< / div > < div class = "line" > < a name = "l00525" > < / a > < span class = "lineno" > 525< / span >   }< / div > < div class = "line" > < a name = "l00526" > < / a > < span class = "lineno" > 526< / span >   ++posn;< / div > < div class = "line" > < a name = "l00527" > < / a > < span class = "lineno" > 527< / span >   }< / div > < div class = "line" > < a name = "l00528" > < / a > < span class = "lineno" > 
528< / span >   < / div > < div class = "line" > < a name = "l00529" > < / a > < span class = "lineno" > 529< / span >   < span class = "keywordflow" > return< / span > text;< / div > < div class = "line" > < a name = "l00530" > < / a > < span class = "lineno" > 530< / span >   }< / div > < / div > <!-- fragment -->
< / div >
< / div >
< a id = "a203f46af6271ecd5a1ff01c63461d842" > < / a >
< h2 class = "memtitle" > < span class = "permalink" > < a href = "#a203f46af6271ecd5a1ff01c63461d842" > ◆ < / a > < / span > set_curlpp_options()< / h2 >
< div class = "memitem" >
< div class = "memproto" >
< table class = "mlabels" >
< tr >
< td class = "mlabels-left" >
< table class = "memname" >
< tr >
< td class = "memname" > void remwharead::URI::set_curlpp_options < / td >
< td > (< / td >
< td class = "paramtype" > curlpp::Easy &   < / td >
< td class = "paramname" > < em > request< / em > < / td > < td > )< / td >
< td > < / td >
< / tr >
< / table >
< / td >
< td class = "mlabels-right" >
< span class = "mlabels" > < span class = "mlabel" > protected< / span > < / span > < / td >
< / tr >
< / table >
< / div > < div class = "memdoc" >
< p > Sets common curlpp options. < / p >
< div class = "fragment" > < div class = "line" > < a name = "l00084" > < / a > < span class = "lineno" > 84< / span >   {< / div > < div class = "line" > < a name = "l00085" > < / a > < span class = "lineno" > 85< / span >   request.setOpt< curlopts::UserAgent> (string(< span class = "stringliteral" > " remwharead/" < / span > )< / div > < div class = "line" > < a name = "l00086" > < / a > < span class = "lineno" > 86< / span >   + global::version);< / div > < div class = "line" > < a name = "l00087" > < / a > < span class = "lineno" > 87< / span >   request.setOpt< curlopts::HttpHeader> ({ < span class = "stringliteral" > " Connection: close" < / span > });< / div > < div class = "line" > < a name = "l00088" > < / a > < span class = "lineno" > 88< / span >   request.setOpt< curlopts::FollowLocation> (< span class = "keyword" > true< / span > );< / div > < div class = "line" > < a name = "l00089" > < / a > < span class = "lineno" > 89< / span >   }< / div > < / div > <!-- fragment -->
< / div >
< / div >
< a id = "ad6ad5351ecf2983e01f9f4a51c2057a5" > < / a >
< h2 class = "memtitle" > < span class = "permalink" > < a href = "#ad6ad5351ecf2983e01f9f4a51c2057a5" > ◆ < / a > < / span > strip_html()< / h2 >
< div class = "memitem" >
< div class = "memproto" >
< table class = "mlabels" >
< tr >
< td class = "mlabels-left" >
< table class = "memname" >
< tr >
< td class = "memname" > const string remwharead::URI::strip_html < / td >
< td > (< / td >
< td class = "paramtype" > const string &   < / td >
< td class = "paramname" > < em > html< / em > < / td > < td > )< / td >
< td > < / td >
< / tr >
< / table >
< / td >
< td class = "mlabels-right" >
< span class = "mlabels" > < span class = "mlabel" > protected< / span > < / span > < / td >
< / tr >
< / table >
< / div > < div class = "memdoc" >
< p > Removes HTML tags and superfluous spaces from an HTML page. < / p >
< div class = "fragment" > < div class = "line" > < a name = "l00119" > < / a > < span class = "lineno" > 119< / span >   {< / div > < div class = "line" > < a name = "l00120" > < / a > < span class = "lineno" > 120< / span >   < span class = "keywordtype" > string< / span > out;< / div > < div class = "line" > < a name = "l00121" > < / a > < span class = "lineno" > 121< / span >   < / div > < div class = "line" > < a name = "l00122" > < / a > < span class = "lineno" > 122< / span >   out = < a class = "code" href = "classremwharead_1_1URI.html#a8b340b13ccf0bc3ae9059872ce48e06a" > remove_html_tags< / a > (html, < span class = "stringliteral" > " script" < / span > ); < span class = "comment" > // Remove JavaScript.< / span > < / div > < div class = "line" > < a name = "l00123" > < / a > < span class = "lineno" > 123< / span >   out = < a class = "code" href = "classremwharead_1_1URI.html#a8b340b13ccf0bc3ae9059872ce48e06a" > remove_html_tags< / a > (out, < span class = "stringliteral" > " style" < / span > ); < span class = "comment" > // Remove CSS.< / span > < / div > < div class = "line" > < a name = "l00124" > < / a > < span class = "lineno" > 124< / span >   out = < a class = "code" href = "classremwharead_1_1URI.html#a8b340b13ccf0bc3ae9059872ce48e06a" > remove_html_tags< / a > (out); < span class = "comment" > // Remove tags.< / span > < / div > < div class = "line" > < a name = "l00125" > < / a > < span class = "lineno" > 125< / span >   < / div > < div class = "line" > < a name = "l00126" > < / a > < span class = "lineno" > 126< / span >   < span class = "keywordtype" > size_t< / span > pos = 0;< / div > < div class = "line" > < a name = "l00127" > < / a > < span class = "lineno" > 127< / span >   < span class = "keywordflow" > while< / span > ((pos = out.find(< span class = "stringliteral" > " \r" < / span > , pos)) != std::string::npos) < span class = "comment" > // Remove CR.< / span > < / div > < div class = "line" > < a name = "l00128" > < / a > < span 
class = "lineno" > 128< / span >   {< / div > < div class = "line" > < a name = "l00129" > < / a > < span class = "lineno" > 129< / span >   out.replace(pos, 1, < span class = "stringliteral" > " " < / span > );< / div > < div class = "line" > < a name = "l00130" > < / a > < span class = "lineno" > 130< / span >   }< / div > < div class = "line" > < a name = "l00131" > < / a > < span class = "lineno" > 131< / span >   < / div > < div class = "line" > < a name = "l00132" > < / a > < span class = "lineno" > 132< / span >   out = regex_replace(out, regex(< span class = "stringliteral" > " \\s+\n" < / span > ), < span class = "stringliteral" > " \n" < / span > ); < span class = "comment" > // Remove space at eol.< / span > < / div > < div class = "line" > < a name = "l00133" > < / a > < span class = "lineno" > 133< / span >   out = regex_replace(out, regex(< span class = "stringliteral" > " \n{2,}" < / span > ), < span class = "stringliteral" > " \n" < / span > ); < span class = "comment" > // Reduce newlines.< / span > < / div > < div class = "line" > < a name = "l00134" > < / a > < span class = "lineno" > 134< / span >   < / div > < div class = "line" > < a name = "l00135" > < / a > < span class = "lineno" > 135< / span >   < span class = "keywordflow" > return< / span > < a class = "code" href = "classremwharead_1_1URI.html#a37c36dbf8ccc03c8cc132be29e49f4ec" > unescape_html< / a > (out);< / div > < div class = "line" > < a name = "l00136" > < / a > < span class = "lineno" > 136< / span >   }< / div > < div class = "ttc" id = "classremwharead_1_1URI_html_a8b340b13ccf0bc3ae9059872ce48e06a" > < div class = "ttname" > < a href = "classremwharead_1_1URI.html#a8b340b13ccf0bc3ae9059872ce48e06a" > remwharead::URI::remove_html_tags< / a > < / div > < div class = "ttdeci" > const string remove_html_tags(const string & html, const string & tag=" " )< / div > < div class = "ttdoc" > Remove HTML tags. 
< / div > < div class = "ttdef" > < b > Definition:< / b > uri.cpp:137< / div > < / div >
< div class = "ttc" id = "classremwharead_1_1URI_html_a37c36dbf8ccc03c8cc132be29e49f4ec" > < div class = "ttname" > < a href = "classremwharead_1_1URI.html#a37c36dbf8ccc03c8cc132be29e49f4ec" > remwharead::URI::unescape_html< / a > < / div > < div class = "ttdeci" > const string unescape_html(const string & html)< / div > < div class = "ttdoc" > Convert HTML entities to UTF-8. < / div > < div class = "ttdef" > < b > Definition:< / b > uri.cpp:175< / div > < / div >
< / div > <!-- fragment -->
< / div >
< / div >
< a id = "a37c36dbf8ccc03c8cc132be29e49f4ec" > < / a >
< h2 class = "memtitle" > < span class = "permalink" > < a href = "#a37c36dbf8ccc03c8cc132be29e49f4ec" > ◆ < / a > < / span > unescape_html()< / h2 >
< div class = "memitem" >
< div class = "memproto" >
< table class = "mlabels" >
< tr >
< td class = "mlabels-left" >
< table class = "memname" >
< tr >
< td class = "memname" > const string remwharead::URI::unescape_html < / td >
< td > (< / td >
< td class = "paramtype" > const string &   < / td >
< td class = "paramname" > < em > html< / em > < / td > < td > )< / td >
< td > < / td >
< / tr >
< / table >
< / td >
< td class = "mlabels-right" >
< span class = "mlabels" > < span class = "mlabel" > protected< / span > < / span > < / td >
< / tr >
< / table >
< / div > < div class = "memdoc" >
< p > Convert HTML entities to UTF-8. < / p >
< div class = "fragment" > < div class = "line" > < a name = "l00176" > < / a > < span class = "lineno" > 176< / span >   {< / div > < div class = "line" > < a name = "l00177" > < / a > < span class = "lineno" > 177< / span >   < span class = "keywordtype" > string< / span > buffer = html;< / div > < div class = "line" > < a name = "l00178" > < / a > < span class = "lineno" > 178< / span >   < span class = "keywordtype" > string< / span > output;< / div > < div class = "line" > < a name = "l00179" > < / a > < span class = "lineno" > 179< / span >   < / div > < div class = "line" > < a name = "l00180" > < / a > < span class = "lineno" > 180< / span >   < span class = "comment" > // Used to convert int to utf-8 char.< / span > < / div > < div class = "line" > < a name = "l00181" > < / a > < span class = "lineno" > 181< / span >   std::wstring_convert< std::codecvt_utf8< char32_t> , char32_t> u8c;< / div > < div class = "line" > < a name = "l00182" > < / a > < span class = "lineno" > 182< / span >   regex re_entity(< span class = "stringliteral" > " & #(x)?([[:alnum:]]{1,8});" < / span > );< / div > < div class = "line" > < a name = "l00183" > < / a > < span class = "lineno" > 183< / span >   smatch match;< / div > < div class = "line" > < a name = "l00184" > < / a > < span class = "lineno" > 184< / span >   < / div > < div class = "line" > < a name = "l00185" > < / a > < span class = "lineno" > 185< / span >   < span class = "keywordflow" > while< / span > (regex_search(buffer, match, re_entity))< / div > < div class = "line" > < a name = "l00186" > < / a > < span class = "lineno" > 186< / span >   {< / div > < div class = "line" > < a name = "l00187" > < / a > < span class = "lineno" > 187< / span >   char32_t codepoint = 0;< / div > < div class = "line" > < a name = "l00188" > < / a > < span class = "lineno" > 188< / span >   < span class = "comment" > // ' x' in front of the number means it' s hexadecimal, else decimal.< / span > < / div > < div class = "line" > < 
a name = "l00189" > < / a > < span class = "lineno" > 189< / span >   < span class = "keywordflow" > if< / span > (match[1].length() == 1)< / div > < div class = "line" > < a name = "l00190" > < / a > < span class = "lineno" > 190< / span >   {< / div > < div class = "line" > < a name = "l00191" > < / a > < span class = "lineno" > 191< / span >   codepoint = std::stoi(match[2].str(), < span class = "keyword" > nullptr< / span > , 16);< / div > < div class = "line" > < a name = "l00192" > < / a > < span class = "lineno" > 192< / span >   }< / div > < div class = "line" > < a name = "l00193" > < / a > < span class = "lineno" > 193< / span >   < span class = "keywordflow" > else< / span > < / div > < div class = "line" > < a name = "l00194" > < / a > < span class = "lineno" > 194< / span >   {< / div > < div class = "line" > < a name = "l00195" > < / a > < span class = "lineno" > 195< / span >   codepoint = std::stoi(match[2].str(), < span class = "keyword" > nullptr< / span > , 10);< / div > < div class = "line" > < a name = "l00196" > < / a > < span class = "lineno" > 196< / span >   }< / div > < div class = "line" > < a name = "l00197" > < / a > < span class = "lineno" > 197< / span >   output += match.prefix().str() + u8c.to_bytes(codepoint);< / div > < div class = "line" > < a name = "l00198" > < / a > < span class = "lineno" > 198< / span >   buffer = match.suffix().str();< / div > < div class = "line" > < a name = "l00199" > < / a > < span class = "lineno" > 199< / span >   }< / div > < div class = "line" > < a name = "l00200" > < / a > < span class = "lineno" > 200< / span >   output += buffer;< / div > < div class = "line" > < a name = "l00201" > < / a > < span class = "lineno" > 201< / span >   < / div > < div class = "line" > < a name = "l00202" > < / a > < span class = "lineno" > 202< / span >   < span class = "comment" > // Source: https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_< / span > < / div > < div class = "line" > < a name = "l00203" 
> < / a > < span class = "lineno" > 203< / span >   < span class = "comment" > // entity_references#Character_entity_references_in_HTML< / span > < / div > < div class = "line" > < a name = "l00204" > < / a > < span class = "lineno" > 204< / span >   < span class = "keyword" > const< / span > array< const std::pair< const string, const char32_t> , 258> names =< / div > < div class = "line" > < a name = "l00205" > < / a > < span class = "lineno" > 205< / span >   {{< / div > < div class = "line" > < a name = "l00206" > < / a > < span
< / div >
< / div >
2019-07-27 22:46:58 +02:00
< hr / > The documentation for this class was generated from the following files:< ul >
< li > src/lib/< a class = "el" href = "uri_8hpp_source.html" > uri.hpp< / a > < / li >
< li > src/lib/uri.cpp< / li >
< / ul >
< / div > <!-- contents -->
<!-- start footer part -->
< hr class = "footer" / > < address class = "footer" > < small >
Generated by   < a href = "http://www.doxygen.org/index.html" >
< img class = "footer" src = "doxygen.png" alt = "doxygen" / >
< / a > 1.8.14
< / small > < / address >
< / body >
< / html >