remwharead/include/uri.hpp

196 lines
4.0 KiB
C++
Raw Normal View History

/* This file is part of remwharead.
* Copyright © 2019 tastytea <tastytea@tastytea.de>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, version 3.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
2019-05-16 08:36:35 +02:00
#ifndef REMWHAREAD_URI_HPP
#define REMWHAREAD_URI_HPP
2019-10-15 15:10:39 +02:00
#include <cstdint>
#include <string>
2019-07-27 09:59:43 +02:00
namespace remwharead
{
2019-10-15 15:10:39 +02:00
// Make the two std names this header spells unqualified (string, uint16_t)
// visible inside the remwharead namespace.
using std::string;
using std::uint16_t;
/*!
 * @brief Result of processing an HTML page.
 *
 * The object converts explicitly to `bool`; the conversion is documented to
 * yield true if processing was successful.
 *
 * @since 0.7.0
 *
 * @headerfile uri.hpp remwharead/uri.hpp
 */
struct html_extract
{
    //! Success flag, false until set otherwise.
    bool successful{false};
    //! Presumably a description of what went wrong -- confirm against the
    //! implementation, which is not part of this header.
    string error;
    //! Title of the page.
    string title;
    //! Description of the page.
    string description;
    //! Full text of the page.
    string fulltext;

    // NOTE(review): this conversion is not const-qualified, so it cannot be
    // applied to a const html_extract. Adding `const` here would also require
    // changing the out-of-line definition, which lives outside this header.
    explicit operator bool();
};
/*!
 * @brief Outcome of a call to the archive service.
 *
 * The object converts explicitly to `bool`; the conversion is documented to
 * yield true if the call was successful.
 *
 * @since 0.7.0
 *
 * @headerfile uri.hpp remwharead/uri.hpp
 */
struct archive_answer
{
    //! Success flag, false until set otherwise.
    bool successful{false};
    //! Presumably a description of what went wrong -- confirm against the
    //! implementation, which is not part of this header.
    string error;
    //! Presumably the archive-URI handed back by the service (see
    //! URI::archive()) -- confirm against the implementation.
    string uri;

    // NOTE(review): this conversion is not const-qualified, so it cannot be
    // applied to a const archive_answer. Adding `const` here would also
    // require changing the out-of-line definition, which lives outside this
    // header.
    explicit operator bool();
};
/*!
* @brief Download, archive and process an %URI.
*
* @since 0.6.0
*
* @headerfile uri.hpp remwharead/uri.hpp
*/
class URI
{
public:
/*!
* @brief Construct object and set URL.
*
* Initializes TLS and sets proxy from the environment variable
* `http_proxy`, if possible.
*
* @since 0.6.0
*/
explicit URI(string uri);
virtual ~URI();
URI(const URI &other) = default;
URI &operator=(const URI &other) = default;
URI(URI &&other) = default;
URI &operator=(URI &&other) = default;
/*!
* @brief Download %URI and extract title, description and full text.
*
* @since 0.6.0
*/
2019-10-30 08:51:07 +01:00
[[nodiscard]]
html_extract get();
2019-07-27 09:59:43 +02:00
/*!
* @brief Save %URI in archive and return archive-URI.
*
* @since 0.6.0
2019-10-30 08:51:07 +01:00
*/
[[nodiscard]]
2019-10-27 20:38:58 +01:00
archive_answer archive() const;
protected:
string _uri;
2019-12-11 13:00:43 +01:00
string _document;
/*!
* @brief Make a HTTP(S) request.
2019-08-06 12:13:27 +02:00
*
* @since 0.6.0
*/
2019-10-30 08:51:07 +01:00
[[nodiscard]]
string make_request(const string &uri, bool archive = false) const;
/*!
* @brief Extract the title from an HTML page.
*
* @since 0.6.0
*/
2019-10-30 08:51:07 +01:00
[[nodiscard]]
2019-12-11 13:00:43 +01:00
string extract_title() const;
2019-08-06 12:13:27 +02:00
/*!
* @brief Extract the description from an HTML page.
*
* @since 0.6.0
*/
2019-10-30 08:51:07 +01:00
[[nodiscard]]
2019-12-11 13:00:43 +01:00
string extract_description() const;
2019-07-27 09:59:43 +02:00
/*!
* @brief Removes HTML tags and superflous spaces from an HTML page.
*
* @since 0.6.0
*/
2019-10-30 08:51:07 +01:00
[[nodiscard]]
2019-12-11 13:00:43 +01:00
string strip_html() const;
/*!
* @brief Remove HTML tags.
*
* @param html HTML page.
* @param tag If set, only remove this tag.
*
* @since 0.6.0
*/
2019-10-30 08:51:07 +01:00
[[nodiscard]]
string remove_html_tags(const string &html, const string &tag = "") const;
/*!
* @brief Convert HTML entities to UTF-8.
*
* @since 0.6.0
*/
2019-10-30 08:51:07 +01:00
[[nodiscard]]
string unescape_html(string html) const;
/*!
* @brief Replace newlines with spaces.
*
* @since 0.6.0
*/
2019-10-30 08:51:07 +01:00
[[nodiscard]]
string remove_newlines(string text) const;
/*!
* @brief Set proxy server.
*
* @since 0.8.5
*/
void set_proxy();
2019-10-15 15:10:39 +02:00
/*!
* @brief Limits text to N characters, cuts at space.
*
* @since 0.8.5
*/
2019-10-30 08:51:07 +01:00
[[nodiscard]]
string cut_text(const string &text, uint16_t n_chars) const;
2019-12-11 13:00:43 +01:00
/*!
* @brief Returns true if document is *HTML.
*
* @since 0.9.2
*/
bool is_html() const;
};
} // namespace remwharead
2019-05-16 08:36:35 +02:00
#endif // REMWHAREAD_URI_HPP