diff --git a/include/uri.hpp b/include/uri.hpp index 8d2c5f1..71b0a57 100644 --- a/include/uri.hpp +++ b/include/uri.hpp @@ -107,6 +107,7 @@ public: protected: string _uri; + string _document; /*! * @brief Make a HTTP(S) request. @@ -122,7 +123,7 @@ protected: * @since 0.6.0 */ [[nodiscard]] - string extract_title(const string &html) const; + string extract_title() const; /*! * @brief Extract the description from an HTML page. @@ -130,7 +131,7 @@ protected: * @since 0.6.0 */ [[nodiscard]] - string extract_description(const string &html) const; + string extract_description() const; /*! * @brief Removes HTML tags and superflous spaces from an HTML page. @@ -138,7 +139,7 @@ protected: * @since 0.6.0 */ [[nodiscard]] - string strip_html(const string &html) const; + string strip_html() const; /*! * @brief Remove HTML tags. @@ -181,6 +182,13 @@ protected: */ [[nodiscard]] string cut_text(const string &text, uint16_t n_chars) const; + + /*! + * @brief Returns true if document is *HTML. + * + * @since 0.9.2 + */ + bool is_html() const; }; } // namespace remwharead diff --git a/src/lib/uri.cpp b/src/lib/uri.cpp index ca11dec..c875a52 100644 --- a/src/lib/uri.cpp +++ b/src/lib/uri.cpp @@ -66,7 +66,7 @@ archive_answer::operator bool() } URI::URI(string uri) - :_uri(move(uri)) + : _uri(move(uri)) { Poco::Net::initializeSSL(); @@ -130,16 +130,16 @@ html_extract URI::get() { try { - const string answer = make_request(_uri); - if (!answer.empty()) + _document = make_request(_uri); + if (!_document.empty()) { return { true, "", - extract_title(answer), - extract_description(answer), - strip_html(answer) + extract_title(), + extract_description(), + strip_html() }; } } @@ -224,14 +224,13 @@ string URI::make_request(const string &uri, bool archive) const } } -string URI::extract_title(const string &html) const +string URI::extract_title() const { - const RegEx re_htmlfile(".*\\.(.?html?|xml|rss)$", RegEx::RE_CASELESS); - if (_uri.substr(0, 4) == "http" || re_htmlfile.match(_uri)) + if (is_html()) { const RegEx re_title("]+)?>([^<]+)", RegEx::RE_CASELESS); vector matches; - re_title.split(html, matches); + re_title.split(_document, matches); if (matches.size() >= 2) { return remove_newlines(unescape_html(matches[1])); @@ -241,29 +240,28 @@ string URI::extract_title(const string &html) const return ""; } -string URI::extract_description(const string &html) const +string URI::extract_description() const { - const RegEx re_htmlfile(".*\\.(.?html?|xml|rss)$", RegEx::RE_CASELESS); - if (_uri.substr(0, 4) == "http" || re_htmlfile.match(_uri)) + if (is_html()) { const RegEx re_desc(R"(description"[^>]+content="([^"]+))", RegEx::RE_CASELESS); vector matches; - re_desc.split(html, matches); + re_desc.split(_document, matches); if (matches.size() >= 2) { - return remove_newlines(cut_text(unescape_html(matches[1]), 500)); + return cut_text(remove_newlines(unescape_html(matches[1])), 500); } } return ""; } -string URI::strip_html(const string &html) const +string URI::strip_html() const { string out; - out = remove_html_tags(html, "script"); // Remove JavaScript. + out = remove_html_tags(_document, "script"); // Remove JavaScript. out = remove_html_tags(out, "style"); // Remove CSS. out = remove_html_tags(out); // Remove tags. @@ -681,4 +679,15 @@ string URI::cut_text(const string &text, const uint16_t n_chars) const return text; } +bool URI::is_html() const +{ + const RegEx re_htmlfile(".*\\.(.?html?|xml|rss)$", RegEx::RE_CASELESS); + if (_uri.substr(0, 4) == "http" || re_htmlfile.match(_uri)) + { + return true; + } + + return false; +} + } // namespace remwharead diff --git a/tests/test_uri.cpp b/tests/test_uri.cpp index 34f403f..743ea26 100644 --- a/tests/test_uri.cpp +++ b/tests/test_uri.cpp @@ -32,41 +32,29 @@ SCENARIO ("URI works correctly") explicit URITest(const string &) : URI("") {} URITest() - : URI("test.html") {} + : URI("test.html") + { + _document = + "title" + "" + "

A short sentence.

" + ""; + } bool test_title() { - if (extract_title(_html) == "title") - { - return true; - } - return false; + return (extract_title() == "title"); } bool test_description() { - if (extract_description(_html) == "description") - { - return true; - } - return false; + return (extract_description() == "description"); } bool test_fulltext() { - if (strip_html(_html) == "titleA short sentence.") - { - return true; - } - return false; + return (strip_html() == "titleA short sentence."); } - - private: - const string _html = - "title" - "" - "

A short sentence.

" - ""; }; WHEN ("extract_title() is called")