From 7c7d28b7bcee397053fd06a6a2bb9af222daa7b4 Mon Sep 17 00:00:00 2001 From: tastytea Date: Wed, 11 Dec 2019 13:00:43 +0100 Subject: [PATCH 1/4] Store document in class variable. --- include/uri.hpp | 14 +++++++++++--- src/lib/uri.cpp | 43 ++++++++++++++++++++++++++----------------- tests/test_uri.cpp | 34 +++++++++++----------------------- 3 files changed, 48 insertions(+), 43 deletions(-) diff --git a/include/uri.hpp b/include/uri.hpp index 8d2c5f1..71b0a57 100644 --- a/include/uri.hpp +++ b/include/uri.hpp @@ -107,6 +107,7 @@ public: protected: string _uri; + string _document; /*! * @brief Make a HTTP(S) request. @@ -122,7 +123,7 @@ protected: * @since 0.6.0 */ [[nodiscard]] - string extract_title(const string &html) const; + string extract_title() const; /*! * @brief Extract the description from an HTML page. @@ -130,7 +131,7 @@ protected: * @since 0.6.0 */ [[nodiscard]] - string extract_description(const string &html) const; + string extract_description() const; /*! * @brief Removes HTML tags and superflous spaces from an HTML page. @@ -138,7 +139,7 @@ protected: * @since 0.6.0 */ [[nodiscard]] - string strip_html(const string &html) const; + string strip_html() const; /*! * @brief Remove HTML tags. @@ -181,6 +182,13 @@ protected: */ [[nodiscard]] string cut_text(const string &text, uint16_t n_chars) const; + + /*! + * @brief Returns true if document is *HTML. + * + * @since 0.9.2 + */ + bool is_html() const; }; } // namespace remwharead diff --git a/src/lib/uri.cpp b/src/lib/uri.cpp index ca11dec..c875a52 100644 --- a/src/lib/uri.cpp +++ b/src/lib/uri.cpp @@ -66,7 +66,7 @@ archive_answer::operator bool() } URI::URI(string uri) - :_uri(move(uri)) + : _uri(move(uri)) { Poco::Net::initializeSSL(); @@ -130,16 +130,16 @@ html_extract URI::get() { try { - const string answer = make_request(_uri); - if (!answer.empty()) + _document = make_request(_uri); + if (!_document.empty()) { return { true, "", - extract_title(answer), - extract_description(answer), - strip_html(answer) + extract_title(), + extract_description(), + strip_html() }; } } @@ -224,14 +224,13 @@ string URI::make_request(const string &uri, bool archive) const } } -string URI::extract_title(const string &html) const +string URI::extract_title() const { - const RegEx re_htmlfile(".*\\.(.?html?|xml|rss)$", RegEx::RE_CASELESS); - if (_uri.substr(0, 4) == "http" || re_htmlfile.match(_uri)) + if (is_html()) { const RegEx re_title("]+)?>([^<]+)", RegEx::RE_CASELESS); vector matches; - re_title.split(html, matches); + re_title.split(_document, matches); if (matches.size() >= 2) { return remove_newlines(unescape_html(matches[1])); @@ -241,29 +240,28 @@ string URI::extract_title(const string &html) const return ""; } -string URI::extract_description(const string &html) const +string URI::extract_description() const { - const RegEx re_htmlfile(".*\\.(.?html?|xml|rss)$", RegEx::RE_CASELESS); - if (_uri.substr(0, 4) == "http" || re_htmlfile.match(_uri)) + if (is_html()) { const RegEx re_desc(R"(description"[^>]+content="([^"]+))", RegEx::RE_CASELESS); vector matches; - re_desc.split(html, matches); + re_desc.split(_document, matches); if (matches.size() >= 2) { - return remove_newlines(cut_text(unescape_html(matches[1]), 500)); + return cut_text(remove_newlines(unescape_html(matches[1])), 500); } } return ""; } -string URI::strip_html(const string &html) const +string URI::strip_html() const { string out; - out = remove_html_tags(html, "script"); // Remove JavaScript. + out = remove_html_tags(_document, "script"); // Remove JavaScript. out = remove_html_tags(out, "style"); // Remove CSS. out = remove_html_tags(out); // Remove tags. @@ -681,4 +679,15 @@ string URI::cut_text(const string &text, const uint16_t n_chars) const return text; } +bool URI::is_html() const +{ + const RegEx re_htmlfile(".*\\.(.?html?|xml|rss)$", RegEx::RE_CASELESS); + if (_uri.substr(0, 4) == "http" || re_htmlfile.match(_uri)) + { + return true; + } + + return false; +} + } // namespace remwharead diff --git a/tests/test_uri.cpp b/tests/test_uri.cpp index 34f403f..743ea26 100644 --- a/tests/test_uri.cpp +++ b/tests/test_uri.cpp @@ -32,41 +32,29 @@ SCENARIO ("URI works correctly") explicit URITest(const string &) : URI("") {} URITest() - : URI("test.html") {} + : URI("test.html") + { + _document = + "title" + "" + "

A short sentence.

" + ""; + } bool test_title() { - if (extract_title(_html) == "title") - { - return true; - } - return false; + return (extract_title() == "title"); } bool test_description() { - if (extract_description(_html) == "description") - { - return true; - } - return false; + return (extract_description() == "description"); } bool test_fulltext() { - if (strip_html(_html) == "titleA short sentence.") - { - return true; - } - return false; + return (strip_html() == "titleA short sentence."); } - - private: - const string _html = - "title" - "" - "

A short sentence.

" - ""; }; WHEN ("extract_title() is called") From 6fa611cf420549ede7ddd5b27a867eeae1959672 Mon Sep 17 00:00:00 2001 From: tastytea Date: Wed, 11 Dec 2019 13:01:44 +0100 Subject: [PATCH 2/4] Detect file encoding of web page and convert to UTF-8. Fixes #6. --- include/uri.hpp | 16 ++++++++++++++++ src/lib/CMakeLists.txt | 4 +++- src/lib/uri.cpp | 23 +++++++++++++++++++++++ 3 files changed, 42 insertions(+), 1 deletion(-) diff --git a/include/uri.hpp b/include/uri.hpp index 71b0a57..d88d128 100644 --- a/include/uri.hpp +++ b/include/uri.hpp @@ -107,6 +107,7 @@ public: protected: string _uri; + string _encoding; string _document; /*! @@ -183,6 +184,21 @@ protected: [[nodiscard]] string cut_text(const string &text, uint16_t n_chars) const; + /*! + * @brief Converts string to UTF-8. + * + * @since 0.9.2 + */ + [[nodiscard]] + inline string to_utf8(const string &str); + + /*! + * @brief Try to detect the encoding of the document. + * + * @since 0.9.2 + */ + void detect_encoding(); + /*! * @brief Returns true if document is *HTML. * diff --git a/src/lib/CMakeLists.txt b/src/lib/CMakeLists.txt index 09d46f1..87fb524 100644 --- a/src/lib/CMakeLists.txt +++ b/src/lib/CMakeLists.txt @@ -5,6 +5,8 @@ find_package(Poco COMPONENTS Foundation Net NetSSL Data DataSQLite JSON XML CONFIG) +find_package(Boost COMPONENTS Locale REQUIRED) + file(GLOB_RECURSE sources_lib *.cpp) file(GLOB_RECURSE headers_lib ../../include/*.hpp) @@ -22,7 +24,7 @@ target_include_directories(${PROJECT_NAME} "$") target_link_libraries(${PROJECT_NAME} - PRIVATE pthread + PRIVATE pthread Boost::Locale PUBLIC stdc++fs) # If no Poco*Config.cmake recipes are found, look for headers in standard dirs. diff --git a/src/lib/uri.cpp b/src/lib/uri.cpp index c875a52..fb0d30c 100644 --- a/src/lib/uri.cpp +++ b/src/lib/uri.cpp @@ -16,6 +16,7 @@ #include "uri.hpp" #include "version.hpp" +#include #include #include #include @@ -131,6 +132,7 @@ html_extract URI::get() try { _document = make_request(_uri); + _document = to_utf8(_document); if (!_document.empty()) { return @@ -679,6 +681,27 @@ string URI::cut_text(const string &text, const uint16_t n_chars) const return text; } +string URI::to_utf8(const string &str) +{ + if (_encoding.empty()) + { + detect_encoding(); + } + + return boost::locale::conv::to_utf(str, _encoding); +} + +void URI::detect_encoding() +{ + const RegEx re_encoding(R"( matches; + re_encoding.split(_document, matches); + if (matches.size() >= 2) + { + _encoding = matches[1]; + } +} + bool URI::is_html() const { const RegEx re_htmlfile(".*\\.(.?html?|xml|rss)$", RegEx::RE_CASELESS); From 3889a1b9158cface489d12bafb184d222ab32393 Mon Sep 17 00:00:00 2001 From: tastytea Date: Wed, 11 Dec 2019 13:58:58 +0100 Subject: [PATCH 3/4] Add Boost to drone recipe. --- .drone.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.drone.yml b/.drone.yml index 588375a..b64945d 100644 --- a/.drone.yml +++ b/.drone.yml @@ -24,7 +24,7 @@ steps: - rm /etc/apt/apt.conf.d/docker-clean - alias apt-get='rm -f /var/cache/apt/archives/lock && apt-get' - apt-get update -q - - apt-get install -qy g++-7 cmake libpoco-dev asciidoc catch + - apt-get install -qy g++-7 cmake libpoco-dev libboost-locale-dev asciidoc catch - rm -rf build && mkdir -p build && cd build - cmake -DWITH_MOZILLA=YES -DWITH_TESTS=YES .. - make VERBOSE=1 @@ -52,7 +52,7 @@ steps: - gpg --armor --export 0x60c317803a41ba51845e371a1e9377a2ba9ef27f | apt-key add - - apt-get update -q - apt-get install -qy -t bionic g++-9 - - apt-get install -qy cmake libpoco-dev asciidoc catch + - apt-get install -qy cmake libpoco-dev libboost-locale-dev asciidoc catch - rm -rf build && mkdir -p build && cd build - cmake -DWITH_MOZILLA=YES .. - make VERBOSE=1 @@ -71,7 +71,7 @@ steps: - rm /etc/apt/apt.conf.d/docker-clean - alias apt-get='rm -f /var/cache/apt/archives/lock && apt-get' - apt-get update -q - - apt-get install -qy clang-6.0 cmake libpoco-dev asciidoc catch + - apt-get install -qy clang-6.0 cmake libpoco-dev libboost-locale-dev asciidoc catch - rm -rf build && mkdir -p build && cd build - cmake -DWITH_MOZILLA=YES .. - make VERBOSE=1 @@ -90,7 +90,7 @@ steps: - rm /etc/apt/apt.conf.d/docker-clean - alias apt-get='rm -f /var/cache/apt/archives/lock && apt-get' - apt-get update -q - - apt-get install -qy clang cmake libpoco-dev asciidoc catch + - apt-get install -qy clang cmake libpoco-dev libboost-locale-dev asciidoc catch - rm -rf build && mkdir -p build && cd build - cmake -DWITH_MOZILLA=YES .. - make VERBOSE=1 @@ -140,7 +140,7 @@ steps: - rm /etc/apt/apt.conf.d/docker-clean - alias apt-get='rm -f /var/cache/apt/archives/lock && apt-get' - apt-get update -q - - apt-get install -qy g++ cmake libpoco-dev asciidoc catch + - apt-get install -qy g++ cmake libpoco-dev libboost-locale-dev asciidoc catch - apt-get install -qy build-essential file zip - rm -rf build && mkdir -p build && cd build - cmake -DCMAKE_INSTALL_PREFIX=/usr -DWITH_MOZILLA=YES -DMOZILLA_NMH_DIR="lib/mozilla/native-messaging-hosts" -DWITH_DEB=YES .. From c5dc9d4098c5ed9feb1539602057d289c1051399 Mon Sep 17 00:00:00 2001 From: tastytea Date: Wed, 11 Dec 2019 14:23:04 +0100 Subject: [PATCH 4/4] Add Boost to Hunter config. --- CMakeLists.txt | 4 +++- src/lib/CMakeLists.txt | 9 ++++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e1644b3..d477518 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,7 +25,8 @@ if(${HUNTER_ENABLED}) # FetchContent_MakeAvailable needs 3.14. if(NOT (${CMAKE_VERSION} VERSION_LESS 3.14)) - set(HUNTER_PACKAGES PocoCpp) + set(HUNTER_Boost_COMPONENTS locale) + set(HUNTER_PACKAGES PocoCpp Boost) include(FetchContent) FetchContent_Declare(SetupHunter GIT_REPOSITORY https://github.com/cpp-pm/gate) FetchContent_MakeAvailable(SetupHunter) @@ -44,6 +45,7 @@ project(remwharead if(${HUNTER_ENABLED} AND ${CMAKE_VERSION} VERSION_LESS 3.14) hunter_add_package(PocoCpp) + hunter_add_package(Boost COMPONENTS locale) endif() set(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake") diff --git a/src/lib/CMakeLists.txt b/src/lib/CMakeLists.txt index 87fb524..69e6436 100644 --- a/src/lib/CMakeLists.txt +++ b/src/lib/CMakeLists.txt @@ -1,11 +1,10 @@ include(GNUInstallDirs) # Some distributions do not contain Poco*Config.cmake recipes. -find_package(Poco - COMPONENTS Foundation Net NetSSL Data DataSQLite JSON XML - CONFIG) +find_package(Poco CONFIG + COMPONENTS Foundation Net NetSSL Data DataSQLite JSON XML) -find_package(Boost COMPONENTS Locale REQUIRED) +find_package(Boost 1.48.0 REQUIRED COMPONENTS locale) file(GLOB_RECURSE sources_lib *.cpp) file(GLOB_RECURSE headers_lib ../../include/*.hpp) @@ -24,7 +23,7 @@ target_include_directories(${PROJECT_NAME} "$") target_link_libraries(${PROJECT_NAME} - PRIVATE pthread Boost::Locale + PRIVATE pthread Boost::locale PUBLIC stdc++fs) # If no Poco*Config.cmake recipes are found, look for headers in standard dirs.