remwharead/include/uri.hpp

/*  This file is part of remwharead.
 *  Copyright © 2019 tastytea <tastytea@tastytea.de>
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, version 3.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#ifndef REMWHAREAD_URI_HPP
#define REMWHAREAD_URI_HPP

#include <cstdint>
#include <string>

namespace remwharead
{
using std::uint16_t;
using std::string;

/*!
 *  @brief  A processed HTML page.
 *
 *  Return type of URI::get(). Carries a success flag, an error string and
 *  the title, description and full text of the page.
 *
 *  @return true if successful, when cast to bool.
 *
 *  @since  0.7.0
 *
 *  @headerfile uri.hpp remwharead/uri.hpp
 */
struct html_extract
{
    //! Success flag, false unless set otherwise.
    bool successful = false;
    //! Error text. NOTE(review): presumably only filled when `successful`
    //! is false; confirm in the implementation.
    string error;
    //! Title of the page.
    string title;
    //! Description of the page.
    string description;
    //! Full text of the page.
    string fulltext;

    //! Explicit conversion to bool; documented to yield true on success.
    //! Only declared here, the definition is not part of this header.
    //! NOTE(review): not const-qualified, so it can not be used on a const
    //! object. Adding `const` requires changing the definition as well.
    explicit operator bool();
};

/*!
 *  @brief  The result of the call to the archive service.
 *
 *  Return type of URI::archive(). Carries a success flag, an error string
 *  and an %URI.
 *
 *  @return true if successful, when cast to bool.
 *
 *  @since  0.7.0
 *
 *  @headerfile uri.hpp remwharead/uri.hpp
 */
struct archive_answer
{
    //! Success flag, false unless set otherwise.
    bool successful = false;
    //! Error text. NOTE(review): presumably only filled when `successful`
    //! is false; confirm in the implementation.
    string error;
    //! NOTE(review): presumably the archive-URI that URI::archive() is
    //! documented to return; confirm in the implementation.
    string uri;

    //! Explicit conversion to bool; documented to yield true on success.
    //! Only declared here, the definition is not part of this header.
    //! NOTE(review): not const-qualified, so it can not be used on a const
    //! object. Adding `const` requires changing the definition as well.
    explicit operator bool();
};

/*!
 *  @brief  Download, archive and process an %URI.
 *
 *  The public interface consists of get() and archive(). Everything else
 *  is a protected helper that is only declared in this header.
 *
 *  @since  0.6.0
 *
 *  @headerfile uri.hpp remwharead/uri.hpp
 */
class URI
{
public:
    /*!
     *  @brief  Construct object and set %URI.
     *
     *  Initializes TLS and sets proxy from the environment variable
     *  `http_proxy`, if possible.
     *
     *  @param  uri The %URI this object works on.
     *
     *  @since  0.6.0
     */
    explicit URI(string uri);

    //! Virtual, so that derived classes can be destroyed through a URI*.
    virtual ~URI();

    // The destructor is user-declared, therefore the copy and move
    // operations are requested explicitly.
    URI(const URI &other) = default;
    URI &operator=(const URI &other) = default;
    URI(URI &&other) = default;
    URI &operator=(URI &&other) = default;

    /*!
     *  @brief  Download %URI and extract title, description and full text.
     *
     *  @return The extracted data, see html_extract. Casts to true if
     *          successful.
     *
     *  @since  0.6.0
     */
    [[nodiscard]]
    html_extract get();

    /*!
     *  @brief  Save %URI in archive and return archive-URI.
     *
     *  @return The answer of the archive service, see archive_answer. Casts
     *          to true if successful.
     *
     *  @since  0.6.0
     */
    [[nodiscard]]
    archive_answer archive() const;

protected:
    //! The %URI passed to the constructor.
    string _uri;
    //! NOTE(review): presumably the downloaded document that the extract_*()
    //! helpers and is_html() operate on; confirm in the implementation.
    string _document;

    /*!
     *  @brief  Make a HTTP(S) request.
     *
     *  @param  uri     The %URI to request.
     *  @param  archive NOTE(review): presumably marks the request as a call
     *                  to the archive service; confirm in the implementation.
     *
     *  @since  0.6.0
     */
    [[nodiscard]]
    string make_request(const string &uri, bool archive = false) const;

    /*!
     *  @brief  Extract the title from an HTML page.
     *
     *  @since  0.6.0
     */
    [[nodiscard]]
    string extract_title() const;

    /*!
     *  @brief  Extract the description from an HTML page.
     *
     *  @since  0.6.0
     */
    [[nodiscard]]
    string extract_description() const;

    /*!
     *  @brief  Removes HTML tags and superfluous spaces from an HTML page.
     *
     *  @since  0.6.0
     */
    [[nodiscard]]
    string strip_html() const;

    /*!
     *  @brief  Remove HTML tags.
     *
     *  @param  html HTML page.
     *  @param  tag  If set, only remove this tag.
     *
     *  @since  0.6.0
     */
    [[nodiscard]]
    string remove_html_tags(const string &html, const string &tag = "") const;

    /*!
     *  @brief  Convert HTML entities to UTF-8.
     *
     *  @param  html Text with HTML entities, taken by value.
     *
     *  @since  0.6.0
     */
    [[nodiscard]]
    string unescape_html(string html) const;

    /*!
     *  @brief  Replace newlines with spaces.
     *
     *  @param  text Text with newlines, taken by value.
     *
     *  @since  0.6.0
     */
    [[nodiscard]]
    string remove_newlines(string text) const;

    /*!
     *  @brief  Set proxy server.
     *
     *  @since  0.8.5
     */
    void set_proxy();

    /*!
     *  @brief  Limits text to N characters, cuts at space.
     *
     *  @param  text    The text to shorten.
     *  @param  n_chars The limit N, in characters.
     *
     *  @since  0.8.5
     */
    [[nodiscard]]
    string cut_text(const string &text, uint16_t n_chars) const;

    /*!
     *  @brief  Returns true if document is *HTML.
     *
     *  @since  0.9.2
     */
    [[nodiscard]]
    bool is_html() const;
};
} // namespace remwharead

#endif  // REMWHAREAD_URI_HPP
Fetch page, extract title, description and full text. 2019-05-15 07:04:12 +02:00			`/* This file is part of remwharead.`
			`* Copyright © 2019 tastytea <tastytea@tastytea.de>`
			`*`
			`* This program is free software: you can redistribute it and/or modify`
			`* it under the terms of the GNU General Public License as published by`
			`* the Free Software Foundation, version 3.`
			`*`
			`* This program is distributed in the hope that it will be useful,`
			`* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`* GNU General Public License for more details.`
			`*`
			`* You should have received a copy of the GNU General Public License`
			`* along with this program. If not, see <http://www.gnu.org/licenses/>.`
			`*/`

Renamed URL to URI. 2019-05-16 08:36:35 +02:00			`#ifndef REMWHAREAD_URI_HPP`
			`#define REMWHAREAD_URI_HPP`
Fetch page, extract title, description and full text. 2019-05-15 07:04:12 +02:00
Cut descriptions at 500 characters. 2019-10-15 15:10:39 +02:00			`#include <cstdint>`
Fetch page, extract title, description and full text. 2019-05-15 07:04:12 +02:00			`#include <string>`

namespaced library. 2019-07-27 09:59:43 +02:00			`namespace remwharead`
Fetch page, extract title, description and full text. 2019-05-15 07:04:12 +02:00			`{`
Cut descriptions at 500 characters. 2019-10-15 15:10:39 +02:00			`using std::uint16_t;`
Changed namespace-indentation and header order. 2019-09-30 13:20:36 +02:00			`using std::string;`

			`/*!`
			`* @brief A processed HTML page.`
			`*`
			`* @return true if successful, when cast to bool.`
			`*`
			`* @since 0.7.0`
			`*`
			`* @headerfile uri.hpp remwharead/uri.hpp`
			`*/`
			`struct html_extract`
			`{`
			`bool successful = false;`
			`string error;`
			`string title;`
			`string description;`
			`string fulltext;`

			`explicit operator bool();`
			`};`

			`/*!`
			`* @brief The result of the call to the archive service.`
			`*`
			`* @return true if successful, when cast to bool.`
			`*`
			`* @since 0.7.0`
			`*`
			`* @headerfile uri.hpp remwharead/uri.hpp`
			`*/`
			`struct archive_answer`
			`{`
			`bool successful = false;`
			`string error;`
			`string uri;`

			`explicit operator bool();`
			`};`

			`/*!`
			`* @brief Download, archive and process an %URI.`
			`*`
			`* @since 0.6.0`
			`*`
			`* @headerfile uri.hpp remwharead/uri.hpp`
			`*/`
			`class URI`
			`{`
			`public:`
			`/*!`
			`* @brief Construct object and set URL.`
			`*`
			`* Initializes TLS and sets proxy from the environment variable`
			* `http_proxy`, if possible.
			`*`
			`* @since 0.6.0`
			`*/`
			`explicit URI(string uri);`
			`virtual ~URI();`

			`URI(const URI &other) = default;`
			`URI &operator=(const URI &other) = default;`
			`URI(URI &&other) = default;`
			`URI &operator=(URI &&other) = default;`

			`/*!`
			`* @brief Download %URI and extract title, description and full text.`
			`*`
			`* @since 0.6.0`
			`*/`
[[nodiscard]] all the functions. 2019-10-30 08:51:07 +01:00			`[[nodiscard]]`
Changed namespace-indentation and header order. 2019-09-30 13:20:36 +02:00			`html_extract get();`
namespaced library. 2019-07-27 09:59:43 +02:00
Display right include paths in Doxygen output. 2019-08-05 23:19:44 +02:00			`/*!`
Changed namespace-indentation and header order. 2019-09-30 13:20:36 +02:00			`* @brief Save %URI in archive and return archive-URI.`
Display right include paths in Doxygen output. 2019-08-05 23:19:44 +02:00			`*`
Changed namespace-indentation and header order. 2019-09-30 13:20:36 +02:00			`* @since 0.6.0`
[[nodiscard]] all the functions. 2019-10-30 08:51:07 +01:00			`*/`
			`[[nodiscard]]`
Mark `URI::archive()` const. 2019-10-27 20:38:58 +01:00			`archive_answer archive() const;`
Changed namespace-indentation and header order. 2019-09-30 13:20:36 +02:00
			`protected:`
			`string _uri;`
Store document in class variable. 2019-12-11 13:00:43 +01:00			`string _document;`
Changed namespace-indentation and header order. 2019-09-30 13:20:36 +02:00
			`/*!`
			`* @brief Make a HTTP(S) request.`
Added error handling to http_extract. 2019-08-06 12:13:27 +02:00			`*`
Changed namespace-indentation and header order. 2019-09-30 13:20:36 +02:00			`* @since 0.6.0`
			`*/`
[[nodiscard]] all the functions. 2019-10-30 08:51:07 +01:00			`[[nodiscard]]`
Changed namespace-indentation and header order. 2019-09-30 13:20:36 +02:00			`string make_request(const string &uri, bool archive = false) const;`

			`/*!`
			`* @brief Extract the title from an HTML page.`
Display right include paths in Doxygen output. 2019-08-05 23:19:44 +02:00			`*`
Changed namespace-indentation and header order. 2019-09-30 13:20:36 +02:00			`* @since 0.6.0`
Display right include paths in Doxygen output. 2019-08-05 23:19:44 +02:00			`*/`
[[nodiscard]] all the functions. 2019-10-30 08:51:07 +01:00			`[[nodiscard]]`
Store document in class variable. 2019-12-11 13:00:43 +01:00			`string extract_title() const;`
Added error handling to http_extract. 2019-08-06 12:13:27 +02:00
Changed namespace-indentation and header order. 2019-09-30 13:20:36 +02:00			`/*!`
			`* @brief Extract the description from an HTML page.`
			`*`
			`* @since 0.6.0`
			`*/`
[[nodiscard]] all the functions. 2019-10-30 08:51:07 +01:00			`[[nodiscard]]`
Store document in class variable. 2019-12-11 13:00:43 +01:00			`string extract_description() const;`
namespaced library. 2019-07-27 09:59:43 +02:00
Added error handling to calls to archive service. 2019-08-06 12:40:52 +02:00			`/*!`
Changed namespace-indentation and header order. 2019-09-30 13:20:36 +02:00			`* @brief Removes HTML tags and superflous spaces from an HTML page.`
Added error handling to calls to archive service. 2019-08-06 12:40:52 +02:00			`*`
Changed namespace-indentation and header order. 2019-09-30 13:20:36 +02:00			`* @since 0.6.0`
			`*/`
[[nodiscard]] all the functions. 2019-10-30 08:51:07 +01:00			`[[nodiscard]]`
Store document in class variable. 2019-12-11 13:00:43 +01:00			`string strip_html() const;`
Changed namespace-indentation and header order. 2019-09-30 13:20:36 +02:00
			`/*!`
			`* @brief Remove HTML tags.`
Added error handling to calls to archive service. 2019-08-06 12:40:52 +02:00			`*`
Changed namespace-indentation and header order. 2019-09-30 13:20:36 +02:00			`* @param html HTML page.`
			`* @param tag If set, only remove this tag.`
Added error handling to calls to archive service. 2019-08-06 12:40:52 +02:00			`*`
Changed namespace-indentation and header order. 2019-09-30 13:20:36 +02:00			`* @since 0.6.0`
Added error handling to calls to archive service. 2019-08-06 12:40:52 +02:00			`*/`
[[nodiscard]] all the functions. 2019-10-30 08:51:07 +01:00			`[[nodiscard]]`
			`string remove_html_tags(const string &html, const string &tag = "") const;`
Added error handling to calls to archive service. 2019-08-06 12:40:52 +02:00
Changed namespace-indentation and header order. 2019-09-30 13:20:36 +02:00			`/*!`
			`* @brief Convert HTML entities to UTF-8.`
			`*`
			`* @since 0.6.0`
			`*/`
[[nodiscard]] all the functions. 2019-10-30 08:51:07 +01:00			`[[nodiscard]]`
			`string unescape_html(string html) const;`
Added error handling to calls to archive service. 2019-08-06 12:40:52 +02:00
Display right include paths in Doxygen output. 2019-08-05 23:19:44 +02:00			`/*!`
Changed namespace-indentation and header order. 2019-09-30 13:20:36 +02:00			`* @brief Replace newlines with spaces.`
Display right include paths in Doxygen output. 2019-08-05 23:19:44 +02:00			`*`
			`* @since 0.6.0`
Changed namespace-indentation and header order. 2019-09-30 13:20:36 +02:00			`*/`
[[nodiscard]] all the functions. 2019-10-30 08:51:07 +01:00			`[[nodiscard]]`
			`string remove_newlines(string text) const;`
Changed namespace-indentation and header order. 2019-09-30 13:20:36 +02:00
			`/*!`
			`* @brief Set proxy server.`
Display right include paths in Doxygen output. 2019-08-05 23:19:44 +02:00			`*`
Changed namespace-indentation and header order. 2019-09-30 13:20:36 +02:00			`* @since 0.8.5`
Display right include paths in Doxygen output. 2019-08-05 23:19:44 +02:00			`*/`
Changed namespace-indentation and header order. 2019-09-30 13:20:36 +02:00			`void set_proxy();`
Cut descriptions at 500 characters. 2019-10-15 15:10:39 +02:00
			`/*!`
			`* @brief Limits text to N characters, cuts at space.`
			`*`
			`* @since 0.8.5`
			`*/`
[[nodiscard]] all the functions. 2019-10-30 08:51:07 +01:00			`[[nodiscard]]`
Remove unnecessary const in function declaration. 2019-10-28 06:46:58 +01:00			`string cut_text(const string &text, uint16_t n_chars) const;`
Store document in class variable. 2019-12-11 13:00:43 +01:00
			`/*!`
			`* @brief Returns true if document is *HTML.`
			`*`
			`* @since 0.9.2`
			`*/`
			`bool is_html() const;`
Changed namespace-indentation and header order. 2019-09-30 13:20:36 +02:00			`};`
Refactored for better readability. Ran clang-tidy over the code, took most of the advice. 2019-09-25 03:58:29 +02:00			`} // namespace remwharead`
Fetch page, extract title, description and full text. 2019-05-15 07:04:12 +02:00
Renamed URL to URI. 2019-05-16 08:36:35 +02:00			`#endif // REMWHAREAD_URI_HPP`