Merge branch 'develop' into main
This commit is contained in:
commit
c68f77262f
10
.drone.yml
10
.drone.yml
|
@ -24,7 +24,7 @@ steps:
|
||||||
- rm /etc/apt/apt.conf.d/docker-clean
|
- rm /etc/apt/apt.conf.d/docker-clean
|
||||||
- alias apt-get='rm -f /var/cache/apt/archives/lock && apt-get'
|
- alias apt-get='rm -f /var/cache/apt/archives/lock && apt-get'
|
||||||
- apt-get update -q
|
- apt-get update -q
|
||||||
- apt-get install -qy g++-7 cmake libpoco-dev asciidoc catch
|
- apt-get install -qy g++-7 cmake libpoco-dev libboost-locale-dev asciidoc catch
|
||||||
- rm -rf build && mkdir -p build && cd build
|
- rm -rf build && mkdir -p build && cd build
|
||||||
- cmake -DWITH_MOZILLA=YES -DWITH_TESTS=YES ..
|
- cmake -DWITH_MOZILLA=YES -DWITH_TESTS=YES ..
|
||||||
- make VERBOSE=1
|
- make VERBOSE=1
|
||||||
|
@ -52,7 +52,7 @@ steps:
|
||||||
- gpg --armor --export 0x60c317803a41ba51845e371a1e9377a2ba9ef27f | apt-key add -
|
- gpg --armor --export 0x60c317803a41ba51845e371a1e9377a2ba9ef27f | apt-key add -
|
||||||
- apt-get update -q
|
- apt-get update -q
|
||||||
- apt-get install -qy -t bionic g++-9
|
- apt-get install -qy -t bionic g++-9
|
||||||
- apt-get install -qy cmake libpoco-dev asciidoc catch
|
- apt-get install -qy cmake libpoco-dev libboost-locale-dev asciidoc catch
|
||||||
- rm -rf build && mkdir -p build && cd build
|
- rm -rf build && mkdir -p build && cd build
|
||||||
- cmake -DWITH_MOZILLA=YES ..
|
- cmake -DWITH_MOZILLA=YES ..
|
||||||
- make VERBOSE=1
|
- make VERBOSE=1
|
||||||
|
@ -71,7 +71,7 @@ steps:
|
||||||
- rm /etc/apt/apt.conf.d/docker-clean
|
- rm /etc/apt/apt.conf.d/docker-clean
|
||||||
- alias apt-get='rm -f /var/cache/apt/archives/lock && apt-get'
|
- alias apt-get='rm -f /var/cache/apt/archives/lock && apt-get'
|
||||||
- apt-get update -q
|
- apt-get update -q
|
||||||
- apt-get install -qy clang-6.0 cmake libpoco-dev asciidoc catch
|
- apt-get install -qy clang-6.0 cmake libpoco-dev libboost-locale-dev asciidoc catch
|
||||||
- rm -rf build && mkdir -p build && cd build
|
- rm -rf build && mkdir -p build && cd build
|
||||||
- cmake -DWITH_MOZILLA=YES ..
|
- cmake -DWITH_MOZILLA=YES ..
|
||||||
- make VERBOSE=1
|
- make VERBOSE=1
|
||||||
|
@ -90,7 +90,7 @@ steps:
|
||||||
- rm /etc/apt/apt.conf.d/docker-clean
|
- rm /etc/apt/apt.conf.d/docker-clean
|
||||||
- alias apt-get='rm -f /var/cache/apt/archives/lock && apt-get'
|
- alias apt-get='rm -f /var/cache/apt/archives/lock && apt-get'
|
||||||
- apt-get update -q
|
- apt-get update -q
|
||||||
- apt-get install -qy clang cmake libpoco-dev asciidoc catch
|
- apt-get install -qy clang cmake libpoco-dev libboost-locale-dev asciidoc catch
|
||||||
- rm -rf build && mkdir -p build && cd build
|
- rm -rf build && mkdir -p build && cd build
|
||||||
- cmake -DWITH_MOZILLA=YES ..
|
- cmake -DWITH_MOZILLA=YES ..
|
||||||
- make VERBOSE=1
|
- make VERBOSE=1
|
||||||
|
@ -140,7 +140,7 @@ steps:
|
||||||
- rm /etc/apt/apt.conf.d/docker-clean
|
- rm /etc/apt/apt.conf.d/docker-clean
|
||||||
- alias apt-get='rm -f /var/cache/apt/archives/lock && apt-get'
|
- alias apt-get='rm -f /var/cache/apt/archives/lock && apt-get'
|
||||||
- apt-get update -q
|
- apt-get update -q
|
||||||
- apt-get install -qy g++ cmake libpoco-dev asciidoc catch
|
- apt-get install -qy g++ cmake libpoco-dev libboost-locale-dev asciidoc catch
|
||||||
- apt-get install -qy build-essential file zip
|
- apt-get install -qy build-essential file zip
|
||||||
- rm -rf build && mkdir -p build && cd build
|
- rm -rf build && mkdir -p build && cd build
|
||||||
- cmake -DCMAKE_INSTALL_PREFIX=/usr -DWITH_MOZILLA=YES -DMOZILLA_NMH_DIR="lib/mozilla/native-messaging-hosts" -DWITH_DEB=YES ..
|
- cmake -DCMAKE_INSTALL_PREFIX=/usr -DWITH_MOZILLA=YES -DMOZILLA_NMH_DIR="lib/mozilla/native-messaging-hosts" -DWITH_DEB=YES ..
|
||||||
|
|
|
@ -25,7 +25,8 @@ if(${HUNTER_ENABLED})
|
||||||
|
|
||||||
# FetchContent_MakeAvailable needs 3.14.
|
# FetchContent_MakeAvailable needs 3.14.
|
||||||
if(NOT (${CMAKE_VERSION} VERSION_LESS 3.14))
|
if(NOT (${CMAKE_VERSION} VERSION_LESS 3.14))
|
||||||
set(HUNTER_PACKAGES PocoCpp)
|
set(HUNTER_Boost_COMPONENTS locale)
|
||||||
|
set(HUNTER_PACKAGES PocoCpp Boost)
|
||||||
include(FetchContent)
|
include(FetchContent)
|
||||||
FetchContent_Declare(SetupHunter GIT_REPOSITORY https://github.com/cpp-pm/gate)
|
FetchContent_Declare(SetupHunter GIT_REPOSITORY https://github.com/cpp-pm/gate)
|
||||||
FetchContent_MakeAvailable(SetupHunter)
|
FetchContent_MakeAvailable(SetupHunter)
|
||||||
|
@ -44,6 +45,7 @@ project(remwharead
|
||||||
|
|
||||||
if(${HUNTER_ENABLED} AND ${CMAKE_VERSION} VERSION_LESS 3.14)
|
if(${HUNTER_ENABLED} AND ${CMAKE_VERSION} VERSION_LESS 3.14)
|
||||||
hunter_add_package(PocoCpp)
|
hunter_add_package(PocoCpp)
|
||||||
|
hunter_add_package(Boost COMPONENTS locale)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
set(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
|
set(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
|
||||||
|
|
|
@ -107,6 +107,8 @@ public:
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
string _uri;
|
string _uri;
|
||||||
|
string _encoding;
|
||||||
|
string _document;
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
* @brief Make a HTTP(S) request.
|
* @brief Make a HTTP(S) request.
|
||||||
|
@ -122,7 +124,7 @@ protected:
|
||||||
* @since 0.6.0
|
* @since 0.6.0
|
||||||
*/
|
*/
|
||||||
[[nodiscard]]
|
[[nodiscard]]
|
||||||
string extract_title(const string &html) const;
|
string extract_title() const;
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
* @brief Extract the description from an HTML page.
|
* @brief Extract the description from an HTML page.
|
||||||
|
@ -130,7 +132,7 @@ protected:
|
||||||
* @since 0.6.0
|
* @since 0.6.0
|
||||||
*/
|
*/
|
||||||
[[nodiscard]]
|
[[nodiscard]]
|
||||||
string extract_description(const string &html) const;
|
string extract_description() const;
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
* @brief Removes HTML tags and superflous spaces from an HTML page.
|
* @brief Removes HTML tags and superflous spaces from an HTML page.
|
||||||
|
@ -138,7 +140,7 @@ protected:
|
||||||
* @since 0.6.0
|
* @since 0.6.0
|
||||||
*/
|
*/
|
||||||
[[nodiscard]]
|
[[nodiscard]]
|
||||||
string strip_html(const string &html) const;
|
string strip_html() const;
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
* @brief Remove HTML tags.
|
* @brief Remove HTML tags.
|
||||||
|
@ -181,6 +183,28 @@ protected:
|
||||||
*/
|
*/
|
||||||
[[nodiscard]]
|
[[nodiscard]]
|
||||||
string cut_text(const string &text, uint16_t n_chars) const;
|
string cut_text(const string &text, uint16_t n_chars) const;
|
||||||
|
|
||||||
|
/*!
|
||||||
|
* @brief Converts string to UTF-8.
|
||||||
|
*
|
||||||
|
* @since 0.9.2
|
||||||
|
*/
|
||||||
|
[[nodiscard]]
|
||||||
|
inline string to_utf8(const string &str);
|
||||||
|
|
||||||
|
/*!
|
||||||
|
* @brief Try to detect the encoding of the document.
|
||||||
|
*
|
||||||
|
* @since 0.9.2
|
||||||
|
*/
|
||||||
|
void detect_encoding();
|
||||||
|
|
||||||
|
/*!
|
||||||
|
* @brief Returns true if document is *HTML.
|
||||||
|
*
|
||||||
|
* @since 0.9.2
|
||||||
|
*/
|
||||||
|
bool is_html() const;
|
||||||
};
|
};
|
||||||
} // namespace remwharead
|
} // namespace remwharead
|
||||||
|
|
||||||
|
|
|
@ -1,9 +1,10 @@
|
||||||
include(GNUInstallDirs)
|
include(GNUInstallDirs)
|
||||||
|
|
||||||
# Some distributions do not contain Poco*Config.cmake recipes.
|
# Some distributions do not contain Poco*Config.cmake recipes.
|
||||||
find_package(Poco
|
find_package(Poco CONFIG
|
||||||
COMPONENTS Foundation Net NetSSL Data DataSQLite JSON XML
|
COMPONENTS Foundation Net NetSSL Data DataSQLite JSON XML)
|
||||||
CONFIG)
|
|
||||||
|
find_package(Boost 1.48.0 REQUIRED COMPONENTS locale)
|
||||||
|
|
||||||
file(GLOB_RECURSE sources_lib *.cpp)
|
file(GLOB_RECURSE sources_lib *.cpp)
|
||||||
file(GLOB_RECURSE headers_lib ../../include/*.hpp)
|
file(GLOB_RECURSE headers_lib ../../include/*.hpp)
|
||||||
|
@ -22,7 +23,7 @@ target_include_directories(${PROJECT_NAME}
|
||||||
"$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>")
|
"$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>")
|
||||||
|
|
||||||
target_link_libraries(${PROJECT_NAME}
|
target_link_libraries(${PROJECT_NAME}
|
||||||
PRIVATE pthread
|
PRIVATE pthread Boost::locale
|
||||||
PUBLIC stdc++fs)
|
PUBLIC stdc++fs)
|
||||||
|
|
||||||
# If no Poco*Config.cmake recipes are found, look for headers in standard dirs.
|
# If no Poco*Config.cmake recipes are found, look for headers in standard dirs.
|
||||||
|
|
|
@ -16,6 +16,7 @@
|
||||||
|
|
||||||
#include "uri.hpp"
|
#include "uri.hpp"
|
||||||
#include "version.hpp"
|
#include "version.hpp"
|
||||||
|
#include <boost/locale.hpp>
|
||||||
#include <Poco/Environment.h>
|
#include <Poco/Environment.h>
|
||||||
#include <Poco/Exception.h>
|
#include <Poco/Exception.h>
|
||||||
#include <Poco/Net/HTTPClientSession.h>
|
#include <Poco/Net/HTTPClientSession.h>
|
||||||
|
@ -66,7 +67,7 @@ archive_answer::operator bool()
|
||||||
}
|
}
|
||||||
|
|
||||||
URI::URI(string uri)
|
URI::URI(string uri)
|
||||||
:_uri(move(uri))
|
: _uri(move(uri))
|
||||||
{
|
{
|
||||||
Poco::Net::initializeSSL();
|
Poco::Net::initializeSSL();
|
||||||
|
|
||||||
|
@ -130,16 +131,17 @@ html_extract URI::get()
|
||||||
{
|
{
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
const string answer = make_request(_uri);
|
_document = make_request(_uri);
|
||||||
if (!answer.empty())
|
_document = to_utf8(_document);
|
||||||
|
if (!_document.empty())
|
||||||
{
|
{
|
||||||
return
|
return
|
||||||
{
|
{
|
||||||
true,
|
true,
|
||||||
"",
|
"",
|
||||||
extract_title(answer),
|
extract_title(),
|
||||||
extract_description(answer),
|
extract_description(),
|
||||||
strip_html(answer)
|
strip_html()
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -224,14 +226,13 @@ string URI::make_request(const string &uri, bool archive) const
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
string URI::extract_title(const string &html) const
|
string URI::extract_title() const
|
||||||
{
|
{
|
||||||
const RegEx re_htmlfile(".*\\.(.?html?|xml|rss)$", RegEx::RE_CASELESS);
|
if (is_html())
|
||||||
if (_uri.substr(0, 4) == "http" || re_htmlfile.match(_uri))
|
|
||||||
{
|
{
|
||||||
const RegEx re_title("<title(?: [^>]+)?>([^<]+)", RegEx::RE_CASELESS);
|
const RegEx re_title("<title(?: [^>]+)?>([^<]+)", RegEx::RE_CASELESS);
|
||||||
vector<string> matches;
|
vector<string> matches;
|
||||||
re_title.split(html, matches);
|
re_title.split(_document, matches);
|
||||||
if (matches.size() >= 2)
|
if (matches.size() >= 2)
|
||||||
{
|
{
|
||||||
return remove_newlines(unescape_html(matches[1]));
|
return remove_newlines(unescape_html(matches[1]));
|
||||||
|
@ -241,29 +242,28 @@ string URI::extract_title(const string &html) const
|
||||||
return "";
|
return "";
|
||||||
}
|
}
|
||||||
|
|
||||||
string URI::extract_description(const string &html) const
|
string URI::extract_description() const
|
||||||
{
|
{
|
||||||
const RegEx re_htmlfile(".*\\.(.?html?|xml|rss)$", RegEx::RE_CASELESS);
|
if (is_html())
|
||||||
if (_uri.substr(0, 4) == "http" || re_htmlfile.match(_uri))
|
|
||||||
{
|
{
|
||||||
const RegEx re_desc(R"(description"[^>]+content="([^"]+))",
|
const RegEx re_desc(R"(description"[^>]+content="([^"]+))",
|
||||||
RegEx::RE_CASELESS);
|
RegEx::RE_CASELESS);
|
||||||
vector<string> matches;
|
vector<string> matches;
|
||||||
re_desc.split(html, matches);
|
re_desc.split(_document, matches);
|
||||||
if (matches.size() >= 2)
|
if (matches.size() >= 2)
|
||||||
{
|
{
|
||||||
return remove_newlines(cut_text(unescape_html(matches[1]), 500));
|
return cut_text(remove_newlines(unescape_html(matches[1])), 500);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return "";
|
return "";
|
||||||
}
|
}
|
||||||
|
|
||||||
string URI::strip_html(const string &html) const
|
string URI::strip_html() const
|
||||||
{
|
{
|
||||||
string out;
|
string out;
|
||||||
|
|
||||||
out = remove_html_tags(html, "script"); // Remove JavaScript.
|
out = remove_html_tags(_document, "script"); // Remove JavaScript.
|
||||||
out = remove_html_tags(out, "style"); // Remove CSS.
|
out = remove_html_tags(out, "style"); // Remove CSS.
|
||||||
out = remove_html_tags(out); // Remove tags.
|
out = remove_html_tags(out); // Remove tags.
|
||||||
|
|
||||||
|
@ -681,4 +681,36 @@ string URI::cut_text(const string &text, const uint16_t n_chars) const
|
||||||
return text;
|
return text;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
string URI::to_utf8(const string &str)
|
||||||
|
{
|
||||||
|
if (_encoding.empty())
|
||||||
|
{
|
||||||
|
detect_encoding();
|
||||||
|
}
|
||||||
|
|
||||||
|
return boost::locale::conv::to_utf<char>(str, _encoding);
|
||||||
|
}
|
||||||
|
|
||||||
|
void URI::detect_encoding()
|
||||||
|
{
|
||||||
|
const RegEx re_encoding(R"(<meta.+charset=(.+)[";])", RegEx::RE_CASELESS);
|
||||||
|
vector<string> matches;
|
||||||
|
re_encoding.split(_document, matches);
|
||||||
|
if (matches.size() >= 2)
|
||||||
|
{
|
||||||
|
_encoding = matches[1];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
bool URI::is_html() const
|
||||||
|
{
|
||||||
|
const RegEx re_htmlfile(".*\\.(.?html?|xml|rss)$", RegEx::RE_CASELESS);
|
||||||
|
if (_uri.substr(0, 4) == "http" || re_htmlfile.match(_uri))
|
||||||
|
{
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
} // namespace remwharead
|
} // namespace remwharead
|
||||||
|
|
|
@ -32,41 +32,29 @@ SCENARIO ("URI works correctly")
|
||||||
explicit URITest(const string &)
|
explicit URITest(const string &)
|
||||||
: URI("") {}
|
: URI("") {}
|
||||||
URITest()
|
URITest()
|
||||||
: URI("test.html") {}
|
: URI("test.html")
|
||||||
|
{
|
||||||
|
_document =
|
||||||
|
"<html><head><title>title</title>"
|
||||||
|
"<meta name=\"description\" content=\"description\" />"
|
||||||
|
"<body><p>A short <span style=\"\">sentence</span>.</p>"
|
||||||
|
"</body></head></html>";
|
||||||
|
}
|
||||||
|
|
||||||
bool test_title()
|
bool test_title()
|
||||||
{
|
{
|
||||||
if (extract_title(_html) == "title")
|
return (extract_title() == "title");
|
||||||
{
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
bool test_description()
|
bool test_description()
|
||||||
{
|
{
|
||||||
if (extract_description(_html) == "description")
|
return (extract_description() == "description");
|
||||||
{
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
bool test_fulltext()
|
bool test_fulltext()
|
||||||
{
|
{
|
||||||
if (strip_html(_html) == "titleA short sentence.")
|
return (strip_html() == "titleA short sentence.");
|
||||||
{
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
|
||||||
const string _html =
|
|
||||||
"<html><head><title>title</title>"
|
|
||||||
"<meta name=\"description\" content=\"description\" />"
|
|
||||||
"<body><p>A short <span style=\"\">sentence</span>.</p>"
|
|
||||||
"</body></head></html>";
|
|
||||||
};
|
};
|
||||||
|
|
||||||
WHEN ("extract_title() is called")
|
WHEN ("extract_title() is called")
|
||||||
|
|
Loading…
Reference in New Issue