Documented URI.
This commit is contained in:
parent
610173179d
commit
4991f7b006
|
@ -24,6 +24,7 @@ namespace remwharead
|
||||||
{
|
{
|
||||||
using std::string;
|
using std::string;
|
||||||
|
|
||||||
|
//! A processed HTML page.
|
||||||
typedef struct html_extract
|
typedef struct html_extract
|
||||||
{
|
{
|
||||||
string title;
|
string title;
|
||||||
|
@ -31,27 +32,46 @@ namespace remwharead
|
||||||
string fulltext;
|
string fulltext;
|
||||||
} html_extract;
|
} html_extract;
|
||||||
|
|
||||||
|
//! Download, archive and process a URI.
|
||||||
class URI
|
class URI
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
|
//! Construct object and set URI.
|
||||||
explicit URI(const string &uri);
|
explicit URI(const string &uri);
|
||||||
|
|
||||||
//! Download URI and extract title, description and full text.
|
//! Download URI and extract title, description and full text.
|
||||||
const html_extract get();
|
const html_extract get();
|
||||||
//! Save URI in archive and return URI.
|
|
||||||
|
//! Save URI in archive and return archive-URI.
|
||||||
const string archive();
|
const string archive();
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
string _uri;
|
string _uri;
|
||||||
|
|
||||||
|
//! Sets common curlpp options.
|
||||||
void set_curlpp_options(curlpp::Easy &request);
|
void set_curlpp_options(curlpp::Easy &request);
|
||||||
|
|
||||||
|
//! Extract the title from an HTML page.
|
||||||
const string extract_title(const string &html);
|
const string extract_title(const string &html);
|
||||||
|
|
||||||
|
//! Extract the description from an HTML page.
|
||||||
const string extract_description(const string &html);
|
const string extract_description(const string &html);
|
||||||
|
|
||||||
|
//! Removes HTML tags and superfluous spaces from an HTML page.
|
||||||
const string strip_html(const string &html);
|
const string strip_html(const string &html);
|
||||||
//! Remove all HTML tags. If tag is not empty, remove only this tag.
|
|
||||||
|
/*!
|
||||||
|
* @brief Remove HTML tags.
|
||||||
|
*
|
||||||
|
* @param html HTML page.
|
||||||
|
* @param tag If set, only remove this tag.
|
||||||
|
*/
|
||||||
const string remove_html_tags(const string &html,
|
const string remove_html_tags(const string &html,
|
||||||
const string &tag = "");
|
const string &tag = "");
|
||||||
|
|
||||||
|
//! Convert HTML entities to UTF-8.
|
||||||
const string unescape_html(const string &html);
|
const string unescape_html(const string &html);
|
||||||
|
|
||||||
//! Replace newlines with spaces.
|
//! Replace newlines with spaces.
|
||||||
const string remove_newlines(string text);
|
const string remove_newlines(string text);
|
||||||
};
|
};
|
||||||
|
|
Loading…
Reference in New Issue