Merge branch 'develop' into main

This commit is contained in:
tastytea 2019-12-11 14:33:20 +01:00
commit c68f77262f
Signed by: tastytea
GPG Key ID: CFC39497F1B26E07
6 changed files with 100 additions and 53 deletions

View File

@ -24,7 +24,7 @@ steps:
- rm /etc/apt/apt.conf.d/docker-clean - rm /etc/apt/apt.conf.d/docker-clean
- alias apt-get='rm -f /var/cache/apt/archives/lock && apt-get' - alias apt-get='rm -f /var/cache/apt/archives/lock && apt-get'
- apt-get update -q - apt-get update -q
- apt-get install -qy g++-7 cmake libpoco-dev asciidoc catch - apt-get install -qy g++-7 cmake libpoco-dev libboost-locale-dev asciidoc catch
- rm -rf build && mkdir -p build && cd build - rm -rf build && mkdir -p build && cd build
- cmake -DWITH_MOZILLA=YES -DWITH_TESTS=YES .. - cmake -DWITH_MOZILLA=YES -DWITH_TESTS=YES ..
- make VERBOSE=1 - make VERBOSE=1
@ -52,7 +52,7 @@ steps:
- gpg --armor --export 0x60c317803a41ba51845e371a1e9377a2ba9ef27f | apt-key add - - gpg --armor --export 0x60c317803a41ba51845e371a1e9377a2ba9ef27f | apt-key add -
- apt-get update -q - apt-get update -q
- apt-get install -qy -t bionic g++-9 - apt-get install -qy -t bionic g++-9
- apt-get install -qy cmake libpoco-dev asciidoc catch - apt-get install -qy cmake libpoco-dev libboost-locale-dev asciidoc catch
- rm -rf build && mkdir -p build && cd build - rm -rf build && mkdir -p build && cd build
- cmake -DWITH_MOZILLA=YES .. - cmake -DWITH_MOZILLA=YES ..
- make VERBOSE=1 - make VERBOSE=1
@ -71,7 +71,7 @@ steps:
- rm /etc/apt/apt.conf.d/docker-clean - rm /etc/apt/apt.conf.d/docker-clean
- alias apt-get='rm -f /var/cache/apt/archives/lock && apt-get' - alias apt-get='rm -f /var/cache/apt/archives/lock && apt-get'
- apt-get update -q - apt-get update -q
- apt-get install -qy clang-6.0 cmake libpoco-dev asciidoc catch - apt-get install -qy clang-6.0 cmake libpoco-dev libboost-locale-dev asciidoc catch
- rm -rf build && mkdir -p build && cd build - rm -rf build && mkdir -p build && cd build
- cmake -DWITH_MOZILLA=YES .. - cmake -DWITH_MOZILLA=YES ..
- make VERBOSE=1 - make VERBOSE=1
@ -90,7 +90,7 @@ steps:
- rm /etc/apt/apt.conf.d/docker-clean - rm /etc/apt/apt.conf.d/docker-clean
- alias apt-get='rm -f /var/cache/apt/archives/lock && apt-get' - alias apt-get='rm -f /var/cache/apt/archives/lock && apt-get'
- apt-get update -q - apt-get update -q
- apt-get install -qy clang cmake libpoco-dev asciidoc catch - apt-get install -qy clang cmake libpoco-dev libboost-locale-dev asciidoc catch
- rm -rf build && mkdir -p build && cd build - rm -rf build && mkdir -p build && cd build
- cmake -DWITH_MOZILLA=YES .. - cmake -DWITH_MOZILLA=YES ..
- make VERBOSE=1 - make VERBOSE=1
@ -140,7 +140,7 @@ steps:
- rm /etc/apt/apt.conf.d/docker-clean - rm /etc/apt/apt.conf.d/docker-clean
- alias apt-get='rm -f /var/cache/apt/archives/lock && apt-get' - alias apt-get='rm -f /var/cache/apt/archives/lock && apt-get'
- apt-get update -q - apt-get update -q
- apt-get install -qy g++ cmake libpoco-dev asciidoc catch - apt-get install -qy g++ cmake libpoco-dev libboost-locale-dev asciidoc catch
- apt-get install -qy build-essential file zip - apt-get install -qy build-essential file zip
- rm -rf build && mkdir -p build && cd build - rm -rf build && mkdir -p build && cd build
- cmake -DCMAKE_INSTALL_PREFIX=/usr -DWITH_MOZILLA=YES -DMOZILLA_NMH_DIR="lib/mozilla/native-messaging-hosts" -DWITH_DEB=YES .. - cmake -DCMAKE_INSTALL_PREFIX=/usr -DWITH_MOZILLA=YES -DMOZILLA_NMH_DIR="lib/mozilla/native-messaging-hosts" -DWITH_DEB=YES ..

View File

@ -25,7 +25,8 @@ if(${HUNTER_ENABLED})
# FetchContent_MakeAvailable needs 3.14. # FetchContent_MakeAvailable needs 3.14.
if(NOT (${CMAKE_VERSION} VERSION_LESS 3.14)) if(NOT (${CMAKE_VERSION} VERSION_LESS 3.14))
set(HUNTER_PACKAGES PocoCpp) set(HUNTER_Boost_COMPONENTS locale)
set(HUNTER_PACKAGES PocoCpp Boost)
include(FetchContent) include(FetchContent)
FetchContent_Declare(SetupHunter GIT_REPOSITORY https://github.com/cpp-pm/gate) FetchContent_Declare(SetupHunter GIT_REPOSITORY https://github.com/cpp-pm/gate)
FetchContent_MakeAvailable(SetupHunter) FetchContent_MakeAvailable(SetupHunter)
@ -44,6 +45,7 @@ project(remwharead
if(${HUNTER_ENABLED} AND ${CMAKE_VERSION} VERSION_LESS 3.14) if(${HUNTER_ENABLED} AND ${CMAKE_VERSION} VERSION_LESS 3.14)
hunter_add_package(PocoCpp) hunter_add_package(PocoCpp)
hunter_add_package(Boost COMPONENTS locale)
endif() endif()
set(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake") set(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")

View File

@ -107,6 +107,8 @@ public:
protected: protected:
string _uri; string _uri;
string _encoding;
string _document;
/*! /*!
* @brief Make an HTTP(S) request.
@ -122,7 +124,7 @@ protected:
* @since 0.6.0 * @since 0.6.0
*/ */
[[nodiscard]] [[nodiscard]]
string extract_title(const string &html) const; string extract_title() const;
/*! /*!
* @brief Extract the description from an HTML page. * @brief Extract the description from an HTML page.
@ -130,7 +132,7 @@ protected:
* @since 0.6.0 * @since 0.6.0
*/ */
[[nodiscard]] [[nodiscard]]
string extract_description(const string &html) const; string extract_description() const;
/*! /*!
* @brief Removes HTML tags and superfluous spaces from an HTML page.
@ -138,7 +140,7 @@ protected:
* @since 0.6.0 * @since 0.6.0
*/ */
[[nodiscard]] [[nodiscard]]
string strip_html(const string &html) const; string strip_html() const;
/*! /*!
* @brief Remove HTML tags. * @brief Remove HTML tags.
@ -181,6 +183,28 @@ protected:
*/ */
[[nodiscard]] [[nodiscard]]
string cut_text(const string &text, uint16_t n_chars) const; string cut_text(const string &text, uint16_t n_chars) const;
/*!
* @brief Converts string to UTF-8.
*
* @since 0.9.2
*/
[[nodiscard]]
inline string to_utf8(const string &str);
/*!
* @brief Try to detect the encoding of the document.
*
* @since 0.9.2
*/
void detect_encoding();
/*!
* @brief Returns true if document is *HTML.
*
* @since 0.9.2
*/
bool is_html() const;
}; };
} // namespace remwharead } // namespace remwharead

View File

@ -1,9 +1,10 @@
include(GNUInstallDirs) include(GNUInstallDirs)
# Some distributions do not contain Poco*Config.cmake recipes. # Some distributions do not contain Poco*Config.cmake recipes.
find_package(Poco find_package(Poco CONFIG
COMPONENTS Foundation Net NetSSL Data DataSQLite JSON XML COMPONENTS Foundation Net NetSSL Data DataSQLite JSON XML)
CONFIG)
find_package(Boost 1.48.0 REQUIRED COMPONENTS locale)
file(GLOB_RECURSE sources_lib *.cpp) file(GLOB_RECURSE sources_lib *.cpp)
file(GLOB_RECURSE headers_lib ../../include/*.hpp) file(GLOB_RECURSE headers_lib ../../include/*.hpp)
@ -22,7 +23,7 @@ target_include_directories(${PROJECT_NAME}
"$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>") "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>")
target_link_libraries(${PROJECT_NAME} target_link_libraries(${PROJECT_NAME}
PRIVATE pthread PRIVATE pthread Boost::locale
PUBLIC stdc++fs) PUBLIC stdc++fs)
# If no Poco*Config.cmake recipes are found, look for headers in standard dirs. # If no Poco*Config.cmake recipes are found, look for headers in standard dirs.

View File

@ -16,6 +16,7 @@
#include "uri.hpp" #include "uri.hpp"
#include "version.hpp" #include "version.hpp"
#include <boost/locale.hpp>
#include <Poco/Environment.h> #include <Poco/Environment.h>
#include <Poco/Exception.h> #include <Poco/Exception.h>
#include <Poco/Net/HTTPClientSession.h> #include <Poco/Net/HTTPClientSession.h>
@ -66,7 +67,7 @@ archive_answer::operator bool()
} }
URI::URI(string uri) URI::URI(string uri)
:_uri(move(uri)) : _uri(move(uri))
{ {
Poco::Net::initializeSSL(); Poco::Net::initializeSSL();
@ -130,16 +131,17 @@ html_extract URI::get()
{ {
try try
{ {
const string answer = make_request(_uri); _document = make_request(_uri);
if (!answer.empty()) _document = to_utf8(_document);
if (!_document.empty())
{ {
return return
{ {
true, true,
"", "",
extract_title(answer), extract_title(),
extract_description(answer), extract_description(),
strip_html(answer) strip_html()
}; };
} }
} }
@ -224,14 +226,13 @@ string URI::make_request(const string &uri, bool archive) const
} }
} }
string URI::extract_title(const string &html) const string URI::extract_title() const
{ {
const RegEx re_htmlfile(".*\\.(.?html?|xml|rss)$", RegEx::RE_CASELESS); if (is_html())
if (_uri.substr(0, 4) == "http" || re_htmlfile.match(_uri))
{ {
const RegEx re_title("<title(?: [^>]+)?>([^<]+)", RegEx::RE_CASELESS); const RegEx re_title("<title(?: [^>]+)?>([^<]+)", RegEx::RE_CASELESS);
vector<string> matches; vector<string> matches;
re_title.split(html, matches); re_title.split(_document, matches);
if (matches.size() >= 2) if (matches.size() >= 2)
{ {
return remove_newlines(unescape_html(matches[1])); return remove_newlines(unescape_html(matches[1]));
@ -241,29 +242,28 @@ string URI::extract_title(const string &html) const
return ""; return "";
} }
string URI::extract_description(const string &html) const string URI::extract_description() const
{ {
const RegEx re_htmlfile(".*\\.(.?html?|xml|rss)$", RegEx::RE_CASELESS); if (is_html())
if (_uri.substr(0, 4) == "http" || re_htmlfile.match(_uri))
{ {
const RegEx re_desc(R"(description"[^>]+content="([^"]+))", const RegEx re_desc(R"(description"[^>]+content="([^"]+))",
RegEx::RE_CASELESS); RegEx::RE_CASELESS);
vector<string> matches; vector<string> matches;
re_desc.split(html, matches); re_desc.split(_document, matches);
if (matches.size() >= 2) if (matches.size() >= 2)
{ {
return remove_newlines(cut_text(unescape_html(matches[1]), 500)); return cut_text(remove_newlines(unescape_html(matches[1])), 500);
} }
} }
return ""; return "";
} }
string URI::strip_html(const string &html) const string URI::strip_html() const
{ {
string out; string out;
out = remove_html_tags(html, "script"); // Remove JavaScript. out = remove_html_tags(_document, "script"); // Remove JavaScript.
out = remove_html_tags(out, "style"); // Remove CSS. out = remove_html_tags(out, "style"); // Remove CSS.
out = remove_html_tags(out); // Remove tags. out = remove_html_tags(out); // Remove tags.
@ -681,4 +681,36 @@ string URI::cut_text(const string &text, const uint16_t n_chars) const
return text; return text;
} }
// Convert `str` from the document's encoding to UTF-8.
//
// The encoding is looked up lazily: if `_encoding` has not been set yet,
// detect_encoding() is asked to fill it in from `_document`.
//
// Returns the converted text, or `str` unchanged when no encoding could be
// determined. Errors raised by Boost.Locale for a charset name it does not
// know are propagated to the caller unchanged.
string URI::to_utf8(const string &str)
{
    if (_encoding.empty())
    {
        detect_encoding();
    }

    // Without a detected encoding there is nothing to convert from. Pass the
    // text through untouched instead of handing an empty charset name to
    // Boost.Locale, whose handling of "" depends on the conversion backend.
    if (_encoding.empty())
    {
        return str;
    }

    return boost::locale::conv::to_utf<char>(str, _encoding);
}
// Try to find the character encoding declared in `_document` and store it in
// `_encoding`.
//
// Both common declarations are recognised:
//   <meta charset="utf-8">
//   <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
//
// `_encoding` is left untouched when no declaration is found.
void URI::detect_encoding()
{
    // `[^>]+` keeps the search inside a single <meta …> tag. The optional
    // quote is consumed outside the capture group, and the group only accepts
    // characters that occur in charset names, so neither a leading quote nor
    // trailing markup ends up in `_encoding`.
    const RegEx re_encoding(R"(<meta[^>]+charset\s*=\s*["']?([\w.:+-]+))",
                            RegEx::RE_CASELESS);
    vector<string> matches;
    re_encoding.split(_document, matches);

    // matches[0] is the whole match, matches[1] the captured charset name.
    if (matches.size() >= 2)
    {
        _encoding = matches[1];
    }
}
bool URI::is_html() const
{
const RegEx re_htmlfile(".*\\.(.?html?|xml|rss)$", RegEx::RE_CASELESS);
if (_uri.substr(0, 4) == "http" || re_htmlfile.match(_uri))
{
return true;
}
return false;
}
} // namespace remwharead } // namespace remwharead

View File

@ -32,41 +32,29 @@ SCENARIO ("URI works correctly")
explicit URITest(const string &) explicit URITest(const string &)
: URI("") {} : URI("") {}
URITest() URITest()
: URI("test.html") {} : URI("test.html")
{
_document =
"<html><head><title>title</title>"
"<meta name=\"description\" content=\"description\" />"
"<body><p>A short <span style=\"\">sentence</span>.</p>"
"</body></head></html>";
}
bool test_title() bool test_title()
{ {
if (extract_title(_html) == "title") return (extract_title() == "title");
{
return true;
}
return false;
} }
bool test_description() bool test_description()
{ {
if (extract_description(_html) == "description") return (extract_description() == "description");
{
return true;
}
return false;
} }
bool test_fulltext() bool test_fulltext()
{ {
if (strip_html(_html) == "titleA short sentence.") return (strip_html() == "titleA short sentence.");
{
return true;
}
return false;
} }
private:
const string _html =
"<html><head><title>title</title>"
"<meta name=\"description\" content=\"description\" />"
"<body><p>A short <span style=\"\">sentence</span>.</p>"
"</body></head></html>";
}; };
WHEN ("extract_title() is called") WHEN ("extract_title() is called")