remwharead  0.6.3
uri.hpp
1 /* This file is part of remwharead.
2  * Copyright © 2019 tastytea <tastytea@tastytea.de>
3  *
4  * This program is free software: you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation, version 3.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11  * GNU General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public License
14  * along with this program. If not, see <http://www.gnu.org/licenses/>.
15  */
16 
17 #ifndef REMWHAREAD_URI_HPP
18 #define REMWHAREAD_URI_HPP
19 
20 #include <string>
21 #include <curlpp/Easy.hpp>
22 
23 namespace remwharead
24 {
25  using std::string;
26 
//! A processed HTML page.
// NOTE(review): the three members are plain strings; they presumably hold the
// results of URI::extract_title(), URI::extract_description() and
// URI::strip_html() respectively -- confirm against uri.cpp.
28  typedef struct html_extract
29  {
// Title of the page.
30  string title;
// Description of the page.
31  string description;
// Full text of the page (presumably with markup removed -- TODO confirm).
32  string fulltext;
33  } html_extract;
34 
//! Download, archive and process a URI.
36  class URI
37  {
38  public:
//! Construct object and set URL.
// NOTE(review): presumably stores `uri` in _uri -- confirm in uri.cpp.
40  explicit URI(const string &uri);
41 
// Returns an html_extract (title, description, fulltext).
// NOTE(review): no description is visible for this member; presumably it
// downloads the URI and processes the HTML -- confirm in uri.cpp.
43  const html_extract get();
44 
//! Save URI in archive and return archive-URI.
46  const string archive();
47 
48  protected:
// The URI this object operates on.
// NOTE(review): presumably the value passed to the constructor -- confirm.
49  string _uri;
50 
//! Sets common curlpp options.
// `request` is taken by non-const reference, so the caller's object is the
// one being configured.
52  void set_curlpp_options(curlpp::Easy &request);
53 
//! Extract the title from an HTML page.
55  const string extract_title(const string &html);
56 
//! Extract the description from an HTML page.
58  const string extract_description(const string &html);
59 
//! Removes HTML tags and superfluous spaces from an HTML page.
61  const string strip_html(const string &html);
62 
//! Remove HTML tags.
// `tag` defaults to the empty string.
// NOTE(review): an empty `tag` presumably means "remove all tags", a
// non-empty one restricts removal to that tag -- confirm in uri.cpp.
69  const string remove_html_tags(const string &html,
70  const string &tag = "");
71 
//! Convert HTML entities to UTF-8.
73  const string unescape_html(const string &html);
74 
//! Replace newlines with spaces.
// `text` is taken by value, so the caller's string is not modified.
76  const string remove_newlines(string text);
77  };
78 }
79 
80 #endif // REMWHAREAD_URI_HPP
A processed HTML page.
Definition: uri.hpp:28
URI(const string &uri)
Construct object and set URL.
Definition: uri.cpp:43
const string remove_html_tags(const string &html, const string &tag="")
Remove HTML tags.
Definition: uri.cpp:137
Definition: search.cpp:23
const string strip_html(const string &html)
Removes HTML tags and superfluous spaces from an HTML page.
Definition: uri.cpp:118
const string remove_newlines(string text)
Replace newlines with spaces.
Definition: uri.cpp:514
const string unescape_html(const string &html)
Convert HTML entities to UTF-8.
Definition: uri.cpp:175
void set_curlpp_options(curlpp::Easy &request)
Sets common curlpp options.
Definition: uri.cpp:83
const string extract_title(const string &html)
Extract the title from an HTML page.
Definition: uri.cpp:91
Download, archive and process a URI.
Definition: uri.hpp:36
const string archive()
Save URI in archive and return archive-URI.
Definition: uri.cpp:475
const string extract_description(const string &html)
Extract the description from an HTML page.
Definition: uri.cpp:104