Store document in class variable.

This commit is contained in:
tastytea 2019-12-11 13:00:43 +01:00
parent 0431b4a8ca
commit 7c7d28b7bc
Signed by: tastytea
GPG Key ID: CFC39497F1B26E07
3 changed files with 48 additions and 43 deletions

View File

@ -107,6 +107,7 @@ public:
protected:
string _uri;
string _document;
/*!
* @brief Make an HTTP(S) request.
@ -122,7 +123,7 @@ protected:
* @since 0.6.0
*/
[[nodiscard]]
string extract_title(const string &html) const;
string extract_title() const;
/*!
* @brief Extract the description from an HTML page.
@ -130,7 +131,7 @@ protected:
* @since 0.6.0
*/
[[nodiscard]]
string extract_description(const string &html) const;
string extract_description() const;
/*!
* @brief Removes HTML tags and superfluous spaces from an HTML page.
@ -138,7 +139,7 @@ protected:
* @since 0.6.0
*/
[[nodiscard]]
string strip_html(const string &html) const;
string strip_html() const;
/*!
* @brief Remove HTML tags.
@ -181,6 +182,13 @@ protected:
*/
[[nodiscard]]
string cut_text(const string &text, uint16_t n_chars) const;
/*!
* @brief Returns true if the URI starts with "http" or ends in an HTML, XML or RSS file extension.
*
* @since 0.9.2
*/
bool is_html() const;
};
} // namespace remwharead

View File

@ -66,7 +66,7 @@ archive_answer::operator bool()
}
URI::URI(string uri)
:_uri(move(uri))
: _uri(move(uri))
{
Poco::Net::initializeSSL();
@ -130,16 +130,16 @@ html_extract URI::get()
{
try
{
const string answer = make_request(_uri);
if (!answer.empty())
_document = make_request(_uri);
if (!_document.empty())
{
return
{
true,
"",
extract_title(answer),
extract_description(answer),
strip_html(answer)
extract_title(),
extract_description(),
strip_html()
};
}
}
@ -224,14 +224,13 @@ string URI::make_request(const string &uri, bool archive) const
}
}
string URI::extract_title(const string &html) const
string URI::extract_title() const
{
const RegEx re_htmlfile(".*\\.(.?html?|xml|rss)$", RegEx::RE_CASELESS);
if (_uri.substr(0, 4) == "http" || re_htmlfile.match(_uri))
if (is_html())
{
const RegEx re_title("<title(?: [^>]+)?>([^<]+)", RegEx::RE_CASELESS);
vector<string> matches;
re_title.split(html, matches);
re_title.split(_document, matches);
if (matches.size() >= 2)
{
return remove_newlines(unescape_html(matches[1]));
@ -241,29 +240,28 @@ string URI::extract_title(const string &html) const
return "";
}
string URI::extract_description(const string &html) const
string URI::extract_description() const
{
const RegEx re_htmlfile(".*\\.(.?html?|xml|rss)$", RegEx::RE_CASELESS);
if (_uri.substr(0, 4) == "http" || re_htmlfile.match(_uri))
if (is_html())
{
const RegEx re_desc(R"(description"[^>]+content="([^"]+))",
RegEx::RE_CASELESS);
vector<string> matches;
re_desc.split(html, matches);
re_desc.split(_document, matches);
if (matches.size() >= 2)
{
return remove_newlines(cut_text(unescape_html(matches[1]), 500));
return cut_text(remove_newlines(unescape_html(matches[1])), 500);
}
}
return "";
}
string URI::strip_html(const string &html) const
string URI::strip_html() const
{
string out;
out = remove_html_tags(html, "script"); // Remove JavaScript.
out = remove_html_tags(_document, "script"); // Remove JavaScript.
out = remove_html_tags(out, "style"); // Remove CSS.
out = remove_html_tags(out); // Remove tags.
@ -681,4 +679,15 @@ string URI::cut_text(const string &text, const uint16_t n_chars) const
return text;
}
/*!
 *  @brief  Decide from the URI alone whether the document is treated as HTML.
 *
 *  The stored document text is not inspected; only `_uri` is examined.
 *
 *  @return `true` if `_uri` begins with "http" or matches the file-extension
 *          pattern below (case-insensitive), `false` otherwise.
 */
bool URI::is_html() const
{
    // Extension pattern: "html"/"htm" with at most one leading character
    // (e.g. "xhtml", "shtml"), or "xml", or "rss", at the very end of the URI.
    const RegEx re_htmlfile(".*\\.(.?html?|xml|rss)$", RegEx::RE_CASELESS);

    // The first four characters cover both "http://" and "https://".
    const bool has_http_prefix = (_uri.substr(0, 4) == "http");

    return has_http_prefix || re_htmlfile.match(_uri);
}
} // namespace remwharead

View File

@ -32,41 +32,29 @@ SCENARIO ("URI works correctly")
explicit URITest(const string &)
: URI("") {}
URITest()
: URI("test.html") {}
: URI("test.html")
{
_document =
"<html><head><title>title</title>"
"<meta name=\"description\" content=\"description\" />"
"<body><p>A short <span style=\"\">sentence</span>.</p>"
"</body></head></html>";
}
bool test_title()
{
if (extract_title(_html) == "title")
{
return true;
}
return false;
return (extract_title() == "title");
}
bool test_description()
{
if (extract_description(_html) == "description")
{
return true;
}
return false;
return (extract_description() == "description");
}
bool test_fulltext()
{
if (strip_html(_html) == "titleA short sentence.")
{
return true;
}
return false;
return (strip_html() == "titleA short sentence.");
}
private:
const string _html =
"<html><head><title>title</title>"
"<meta name=\"description\" content=\"description\" />"
"<body><p>A short <span style=\"\">sentence</span>.</p>"
"</body></head></html>";
};
WHEN ("extract_title() is called")