/* This file is part of remwharead.
 * Copyright © 2019 tastytea <tastytea@tastytea.de>
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, version 3.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

#ifndef REMWHAREAD_URI_HPP
#define REMWHAREAD_URI_HPP

#include <string>

using std::string;

/*!
 *  @brief  Plain aggregate holding the pieces of text taken from an HTML page.
 *
 *  This is the return type of URI::get(). All members are value-initialized
 *  to empty strings when the struct is value-initialized.
 */
struct html_extract
{
    std::string title;          //!< Title of the document.
    std::string description;    //!< Description of the document.
    std::string fulltext;       //!< Full text of the document.
};
class URI
|
2019-05-15 07:04:12 +02:00
|
|
|
{
|
|
|
|
public:
|
2019-05-16 08:36:35 +02:00
|
|
|
explicit URI(const string &uri);
|
2019-05-15 07:04:12 +02:00
|
|
|
|
2019-05-16 08:36:35 +02:00
|
|
|
//! Download URI and extract title, description and full text.
|
2019-05-15 07:04:12 +02:00
|
|
|
const html_extract get();
|
2019-05-16 08:36:35 +02:00
|
|
|
//! Save URI in archive and return URI.
|
2019-05-15 22:24:11 +02:00
|
|
|
const string archive();
|
2019-05-15 07:04:12 +02:00
|
|
|
|
2019-05-17 03:00:47 +02:00
|
|
|
protected:
|
2019-05-16 08:36:35 +02:00
|
|
|
string _uri;
|
2019-05-15 07:04:12 +02:00
|
|
|
|
|
|
|
const string extract_title(const string &html);
|
|
|
|
const string extract_description(const string &html);
|
|
|
|
const string strip_html(const string &html);
|
2019-05-17 05:43:17 +02:00
|
|
|
//! Remove all HTML tags. If tag is not empty, remove tag and its content.
|
|
|
|
const string remove_html_tags(const string &html, const string &tag = "");
|
2019-05-15 07:04:12 +02:00
|
|
|
const string unescape_html(const string &html);
|
2019-05-16 08:36:35 +02:00
|
|
|
const string remove_newlines(const string &text);
|
2019-05-15 07:04:12 +02:00
|
|
|
};

#endif  // REMWHAREAD_URI_HPP