remwharead  0.9.1
uri.hpp
1 /* This file is part of remwharead.
2  * Copyright © 2019 tastytea <tastytea@tastytea.de>
3  *
4  * This program is free software: you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation, version 3.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11  * GNU General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public License
14  * along with this program. If not, see <http://www.gnu.org/licenses/>.
15  */
16 
17 #ifndef REMWHAREAD_URI_HPP
18 #define REMWHAREAD_URI_HPP
19 
20 #include <cstdint>
21 #include <string>
22 
23 namespace remwharead
24 {
25 using std::uint16_t;
26 using std::string;
27 
// Member list of the aggregate described as "A processed HTML page.".
// NOTE(review): the line that names this type (header line 37) is absent from
// this listing; it is presumably `struct html_extract`, the return type of
// URI::get() -- confirm against the real header.
38 {
 // Starts out false; presumably set to true once processing worked -- confirm.
39  bool successful = false;
 // Presumably a human-readable error message for the failure case -- confirm.
40  string error;
 // Presumably the title extracted from the page -- confirm.
41  string title;
 // Presumably the description extracted from the page -- confirm.
42  string description;
 // Presumably the full text extracted from the page -- confirm.
43  string fulltext;
44 
 // Explicit conversion to bool; only declared here. Presumably reports
 // `successful` -- the definition is not visible, confirm before relying on it.
 // Note that it is not const-qualified, so it cannot be used on const objects.
45  explicit operator bool();
46 };
47 
// Member list of the aggregate described as "The result of the call to the
// archive service.".
// NOTE(review): the line that names this type (header line 57) is absent from
// this listing; it is presumably `struct archive_answer`, the return type of
// URI::archive() -- confirm against the real header.
58 {
 // Starts out false; presumably set to true once archiving worked -- confirm.
59  bool successful = false;
 // Presumably a human-readable error message for the failure case -- confirm.
60  string error;
 // Presumably the URI of the archived copy -- confirm.
61  string uri;
62 
 // Explicit conversion to bool; only declared here. Presumably reports
 // `successful` -- the definition is not visible, confirm before relying on it.
 // Note that it is not const-qualified, so it cannot be used on const objects.
63  explicit operator bool();
64 };
65 
/// Downloads, archives and processes a URI.
///
/// Only declarations live here; the out-of-line definitions are in uri.cpp.
/// The helpers below the `protected:` label are reachable from derived classes.
73 class URI
74 {
75 public:
 /// Constructs the object and sets the URL.
 /// Taken by value and marked explicit, so a plain string never converts
 /// to a URI implicitly.
84  explicit URI(string uri);
 /// Virtual destructor, so the class can be used as a polymorphic base.
85  virtual ~URI();
86 
 // Copy and move operations are all compiler-generated.
87  URI(const URI &other) = default;
88  URI &operator=(const URI &other) = default;
89  URI(URI &&other) = default;
90  URI &operator=(URI &&other) = default;
91 
 /// Downloads the URI and extracts title, description and full text.
 /// Not const-qualified, unlike archive().
97  [[nodiscard]]
98  html_extract get();
99 
 /// Saves the URI in the archive and returns the archive URI.
105  [[nodiscard]]
106  archive_answer archive() const;
107 
108 protected:
 // Presumably the URL handed to the constructor -- confirm in uri.cpp.
109  string _uri;
110 
 /// Makes an HTTP(S) request.
 /// NOTE(review): `archive` defaults to false; it presumably marks requests
 /// sent to the archive service -- confirm in uri.cpp.
116  [[nodiscard]]
117  string make_request(const string &uri, bool archive = false) const;
118 
 /// Extracts the title from an HTML page.
124  [[nodiscard]]
125  string extract_title(const string &html) const;
126 
 /// Extracts the description from an HTML page.
132  [[nodiscard]]
133  string extract_description(const string &html) const;
134 
 /// Removes HTML tags and superfluous spaces from an HTML page.
140  [[nodiscard]]
141  string strip_html(const string &html) const;
142 
 /// Removes HTML tags.
 /// NOTE(review): `tag` defaults to the empty string; presumably a non-empty
 /// value restricts removal to that tag -- confirm in uri.cpp.
151  [[nodiscard]]
152  string remove_html_tags(const string &html, const string &tag = "") const;
153 
 /// Converts HTML entities to UTF-8. Takes its argument by value.
159  [[nodiscard]]
160  string unescape_html(string html) const;
161 
 /// Replaces newlines with spaces. Takes its argument by value.
167  [[nodiscard]]
168  string remove_newlines(string text) const;
169 
 /// Sets the proxy server.
 /// NOTE(review): takes no arguments, so the proxy address must come from
 /// elsewhere (not visible in this header) -- confirm in uri.cpp.
175  void set_proxy();
176 
 /// Limits text to N characters, cutting at a space.
 /// `n_chars` is a uint16_t, so the limit cannot exceed 65535.
182  [[nodiscard]]
183  string cut_text(const string &text, uint16_t n_chars) const;
184 };
185 } // namespace remwharead
186 
187 #endif // REMWHAREAD_URI_HPP
html_extract get()
Download URI and extract title, description and full text.
Definition: uri.cpp:129
URI(string uri)
Construct object and set URL.
Definition: uri.cpp:68
A processed HTML page.
Definition: uri.hpp:37
string unescape_html(string html) const
Convert HTML entities to UTF-8.
void set_proxy()
Set proxy server.
Definition: uri.cpp:76
string make_request(const string &uri, bool archive=false) const
Make an HTTP(S) request.
Definition: uri.cpp:154
string remove_newlines(string text) const
Replace newlines with spaces.
Definition: uri.cpp:645
string remove_html_tags(const string &html, const string &tag="") const
Remove HTML tags.
string strip_html(const string &html) const
Removes HTML tags and superfluous spaces from an HTML page.
string extract_title(const string &html) const
Extract the title from an HTML page.
Definition: uri.cpp:227
The result of the call to the archive service.
Definition: uri.hpp:57
string cut_text(const string &text, uint16_t n_chars) const
Limits text to N characters, cuts at space.
Definition: uri.cpp:663
Download, archive and process a URI.
Definition: uri.hpp:73
archive_answer archive() const
Save URI in archive and return archive-URI.
string extract_description(const string &html) const
Extract the description from an HTML page.
Definition: uri.cpp:244