remwharead/include/uri.hpp

81 lines
2.3 KiB
C++
Raw Normal View History

/* This file is part of remwharead.
* Copyright © 2019 tastytea <tastytea@tastytea.de>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, version 3.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
2019-05-16 08:36:35 +02:00
#ifndef REMWHAREAD_URI_HPP
#define REMWHAREAD_URI_HPP
#include <string>
2019-07-27 09:59:43 +02:00
namespace remwharead
{
2019-07-27 09:59:43 +02:00
using std::string;
2019-07-28 02:13:23 +02:00
/*!
 *  @brief  A processed HTML page.
 *
 *  Plain aggregate of three strings, returned by URI::get(). It is declared
 *  as an ordinary C++ struct: the struct name is already a type name, so the
 *  C-style `typedef struct` wrapper is not needed.
 */
struct html_extract
{
    string title;       //!< Title of the page.
    string description; //!< Description of the page.
    string fulltext;    //!< Full text of the page.
};
//! Download, archive and process an %URI.
2019-07-27 09:59:43 +02:00
class URI
{
public:
2019-07-28 02:13:23 +02:00
//! Construct object and set URL.
2019-07-27 09:59:43 +02:00
explicit URI(const string &uri);
~URI();
2019-07-27 09:59:43 +02:00
//! Download %URI and extract title, description and full text.
2019-07-27 09:59:43 +02:00
const html_extract get();
2019-07-28 02:13:23 +02:00
//! Save %URI in archive and return archive-URI.
2019-07-27 09:59:43 +02:00
const string archive();
protected:
string _uri;
//! Make a HTTPS request.
const string https_request(const string &uri) const;
2019-07-28 02:13:23 +02:00
//! Extract the title from an HTML page.
2019-07-27 09:59:43 +02:00
const string extract_title(const string &html);
2019-07-28 02:13:23 +02:00
//! Extract the description from an HTML page.
2019-07-27 09:59:43 +02:00
const string extract_description(const string &html);
2019-07-28 02:13:23 +02:00
//! Removes HTML tags and superflous spaces from an HTML page.
2019-07-27 09:59:43 +02:00
const string strip_html(const string &html);
2019-07-28 02:13:23 +02:00
/*!
* @brief Remove HTML tags.
*
* @param html HTML page.
* @param tag If set, only remove this tag.
*/
2019-07-27 09:59:43 +02:00
const string remove_html_tags(const string &html,
const string &tag = "");
2019-07-28 02:13:23 +02:00
//! Convert HTML entities to UTF-8.
2019-07-27 09:59:43 +02:00
const string unescape_html(const string &html);
2019-07-28 02:13:23 +02:00
2019-07-27 09:59:43 +02:00
//! Replace newlines with spaces.
const string remove_newlines(string text);
};
}
2019-05-16 08:36:35 +02:00
#endif // REMWHAREAD_URI_HPP