/*  This file is part of remwharead.
 *  Copyright © 2019 tastytea <tastytea@tastytea.de>
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, version 3.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
|
|
|
|
|
2019-05-16 08:36:35 +02:00
|
|
|
#ifndef REMWHAREAD_URI_HPP
|
|
|
|
#define REMWHAREAD_URI_HPP
|
2019-05-15 07:04:12 +02:00
|
|
|
|
|
|
|
#include <string>
|
|
|
|
|
2019-07-27 09:59:43 +02:00
|
|
|
namespace remwharead
|
2019-05-15 07:04:12 +02:00
|
|
|
{
|
2019-07-27 09:59:43 +02:00
|
|
|
using std::string;
|
|
|
|
|
2019-08-05 23:19:44 +02:00
|
|
|
/*!
 *  @brief  A processed HTML page.
 *
 *  Bundles a status flag and an error string with the title, description
 *  and full text of the page.
 *
 *  @return true if successful, when cast to bool.
 *
 *  @since  0.7.0
 *
 *  @headerfile uri.hpp remwharead/uri.hpp
 */
struct html_extract
{
    //! Status flag; starts out as `false`.
    bool successful{false};
    //! Presumably a description of the failure, if any (TODO confirm).
    std::string error;
    //! Title of the page.
    std::string title;
    //! Description of the page.
    std::string description;
    //! Full text of the page.
    std::string fulltext;

    /*!
     *  @brief  Explicit conversion to `bool`; true if successful.
     *
     *  NOTE(review): this operator is not const-qualified, so it cannot be
     *  applied to a const object. Adding `const` requires changing the
     *  out-of-line definition as well.
     */
    explicit operator bool();
};
|
2019-07-27 09:59:43 +02:00
|
|
|
|
2019-08-06 12:40:52 +02:00
|
|
|
/*!
 *  @brief  The result of the call to the archive service.
 *
 *  Bundles a status flag and an error string with a URI.
 *
 *  @return true if successful, when cast to bool.
 *
 *  @since  0.7.0
 *
 *  @headerfile uri.hpp remwharead/uri.hpp
 */
struct archive_answer
{
    //! Status flag; starts out as `false`.
    bool successful{false};
    //! Presumably a description of the failure, if any (TODO confirm).
    std::string error;
    //! Presumably the URI of the archived copy (TODO confirm).
    std::string uri;

    /*!
     *  @brief  Explicit conversion to `bool`; true if successful.
     *
     *  NOTE(review): this operator is not const-qualified, so it cannot be
     *  applied to a const object. Adding `const` requires changing the
     *  out-of-line definition as well.
     */
    explicit operator bool();
};
|
2019-08-06 12:40:52 +02:00
|
|
|
|
2019-08-05 23:19:44 +02:00
|
|
|
/*!
|
|
|
|
* @brief Download, archive and process an %URI.
|
|
|
|
*
|
|
|
|
* @since 0.6.0
|
|
|
|
*
|
|
|
|
* @headerfile uri.hpp remwharead/uri.hpp
|
|
|
|
*/
|
2019-07-27 09:59:43 +02:00
|
|
|
class URI
|
|
|
|
{
|
|
|
|
public:
|
2019-08-07 20:32:46 +02:00
|
|
|
/*!
|
|
|
|
* @brief Construct object and set URL.
|
|
|
|
*
|
|
|
|
* Initializes TLS and sets proxy from the environment variable
|
|
|
|
* `http_proxy`, if possible.
|
|
|
|
*
|
|
|
|
* @since 0.6.0
|
|
|
|
*/
|
2019-09-25 03:58:29 +02:00
|
|
|
explicit URI(string uri);
|
2019-08-07 15:14:00 +02:00
|
|
|
virtual ~URI();
|
2019-07-27 09:59:43 +02:00
|
|
|
|
2019-09-25 03:58:29 +02:00
|
|
|
URI(const URI &other) = default;
|
|
|
|
URI &operator=(const URI &other) = default;
|
|
|
|
URI(URI &&other) = default;
|
|
|
|
URI &operator=(URI &&other) = default;
|
|
|
|
|
2019-08-07 20:32:46 +02:00
|
|
|
/*!
|
|
|
|
* @brief Download %URI and extract title, description and full text.
|
|
|
|
*
|
|
|
|
* @since 0.6.0
|
|
|
|
*/
|
2019-09-25 03:58:29 +02:00
|
|
|
html_extract get();
|
2019-07-28 02:13:23 +02:00
|
|
|
|
2019-08-07 20:32:46 +02:00
|
|
|
/*!
|
|
|
|
* @brief Save %URI in archive and return archive-URI.
|
|
|
|
*
|
|
|
|
* @since 0.6.0
|
|
|
|
*/
|
2019-09-25 03:58:29 +02:00
|
|
|
archive_answer archive();
|
2019-07-27 09:59:43 +02:00
|
|
|
|
|
|
|
protected:
|
|
|
|
string _uri;
|
|
|
|
|
2019-08-07 20:32:46 +02:00
|
|
|
/*!
|
|
|
|
* @brief Make a HTTP(S) request.
|
|
|
|
*
|
|
|
|
* @since 0.6.0
|
|
|
|
*/
|
2019-09-25 03:58:29 +02:00
|
|
|
string make_request(const string &uri, bool archive = false) const;
|
2019-07-28 02:13:23 +02:00
|
|
|
|
2019-08-07 20:32:46 +02:00
|
|
|
/*!
|
|
|
|
* @brief Extract the title from an HTML page.
|
|
|
|
*
|
|
|
|
* @since 0.6.0
|
|
|
|
*/
|
2019-09-25 03:58:29 +02:00
|
|
|
string extract_title(const string &html);
|
2019-07-28 02:13:23 +02:00
|
|
|
|
2019-08-07 20:32:46 +02:00
|
|
|
/*!
|
|
|
|
* @brief Extract the description from an HTML page.
|
|
|
|
*
|
|
|
|
* @since 0.6.0
|
|
|
|
*/
|
2019-09-25 03:58:29 +02:00
|
|
|
string extract_description(const string &html);
|
2019-07-28 02:13:23 +02:00
|
|
|
|
2019-08-07 20:32:46 +02:00
|
|
|
/*!
|
|
|
|
* @brief Removes HTML tags and superflous spaces from an HTML page.
|
|
|
|
*
|
|
|
|
* @since 0.6.0
|
|
|
|
*/
|
2019-09-25 03:58:29 +02:00
|
|
|
string strip_html(const string &html);
|
2019-07-28 02:13:23 +02:00
|
|
|
|
|
|
|
/*!
|
|
|
|
* @brief Remove HTML tags.
|
|
|
|
*
|
|
|
|
* @param html HTML page.
|
|
|
|
* @param tag If set, only remove this tag.
|
2019-08-07 20:32:46 +02:00
|
|
|
*
|
|
|
|
* @since 0.6.0
|
2019-07-28 02:13:23 +02:00
|
|
|
*/
|
2019-09-25 03:58:29 +02:00
|
|
|
string remove_html_tags(const string &html, const string &tag = "");
|
2019-07-28 02:13:23 +02:00
|
|
|
|
2019-08-07 20:32:46 +02:00
|
|
|
/*!
|
|
|
|
* @brief Convert HTML entities to UTF-8.
|
|
|
|
*
|
|
|
|
* @since 0.6.0
|
|
|
|
*/
|
2019-09-25 03:58:29 +02:00
|
|
|
string unescape_html(string html);
|
2019-07-28 02:13:23 +02:00
|
|
|
|
2019-08-07 20:32:46 +02:00
|
|
|
/*!
|
|
|
|
* @brief Replace newlines with spaces.
|
|
|
|
*
|
|
|
|
* @since 0.6.0
|
|
|
|
*/
|
2019-09-25 03:58:29 +02:00
|
|
|
string remove_newlines(string text);
|
|
|
|
|
|
|
|
/*!
|
|
|
|
* @brief Set proxy server.
|
|
|
|
*
|
|
|
|
* @since 0.8.5
|
|
|
|
*/
|
|
|
|
void set_proxy();
|
2019-07-27 09:59:43 +02:00
|
|
|
};
|
2019-09-25 03:58:29 +02:00
|
|
|
} // namespace remwharead
|
2019-05-15 07:04:12 +02:00
|
|
|
|
2019-05-16 08:36:35 +02:00
|
|
|
#endif // REMWHAREAD_URI_HPP
|