Detect file encoding of web page and convert to UTF-8.
continuous-integration/drone/push Build is failing Details

Fixes #6.
This commit is contained in:
tastytea 2019-12-11 13:01:44 +01:00
parent 7c7d28b7bc
commit 6fa611cf42
Signed by: tastytea
GPG Key ID: CFC39497F1B26E07
3 changed files with 42 additions and 1 deletions

View File

@ -107,6 +107,7 @@ public:
protected:
string _uri;
string _encoding;
string _document;
/*!
@ -183,6 +184,21 @@ protected:
[[nodiscard]]
string cut_text(const string &text, uint16_t n_chars) const;
/*!
 *  @brief  Converts string to UTF-8.
 *
 *  The source encoding is taken from `_encoding`; if that is still empty,
 *  detect_encoding() is run first.
 *
 *  Not declared `inline`: the definition lives in the implementation file,
 *  and an inline function must be defined in every translation unit that
 *  uses it.
 *
 *  @param  str The text to convert.
 *
 *  @return The text encoded as UTF-8.
 *
 *  @since  0.9.2
 */
[[nodiscard]]
string to_utf8(const string &str);
/*!
 * @brief Try to detect the encoding of the document.
 *
 * Searches `_document` for a `<meta … charset=…>` declaration and, if one is
 * found, stores the declared charset in `_encoding`. If none is found,
 * `_encoding` is left unchanged.
 *
 * @since 0.9.2
 */
void detect_encoding();
/*!
* @brief Returns true if document is *HTML.
*

View File

@ -5,6 +5,8 @@ find_package(Poco
COMPONENTS Foundation Net NetSSL Data DataSQLite JSON XML
CONFIG)
find_package(Boost COMPONENTS Locale REQUIRED)
file(GLOB_RECURSE sources_lib *.cpp)
file(GLOB_RECURSE headers_lib ../../include/*.hpp)
@ -22,7 +24,7 @@ target_include_directories(${PROJECT_NAME}
"$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>")
target_link_libraries(${PROJECT_NAME}
PRIVATE pthread
PRIVATE pthread Boost::Locale
PUBLIC stdc++fs)
# If no Poco*Config.cmake recipes are found, look for headers in standard dirs.

View File

@ -16,6 +16,7 @@
#include "uri.hpp"
#include "version.hpp"
#include <boost/locale.hpp>
#include <Poco/Environment.h>
#include <Poco/Exception.h>
#include <Poco/Net/HTTPClientSession.h>
@ -131,6 +132,7 @@ html_extract URI::get()
try
{
_document = make_request(_uri);
_document = to_utf8(_document);
if (!_document.empty())
{
return
@ -679,6 +681,27 @@ string URI::cut_text(const string &text, const uint16_t n_chars) const
return text;
}
/*!
 *  Converts `str` from the document's encoding to UTF-8.
 *
 *  If `_encoding` has not been determined yet, detect_encoding() is called
 *  first (it inspects `_document`, not `str`). When no encoding could be
 *  detected, or the detected name is not a charset the conversion backend
 *  knows, the input is returned unchanged instead of throwing, so that a
 *  missing or bogus charset declaration does not abort the whole request.
 *
 *  @param  str The text to convert.
 *
 *  @return `str` converted to UTF-8, or `str` itself if no usable source
 *          encoding is known.
 */
string URI::to_utf8(const string &str)
{
    if (_encoding.empty())
    {
        detect_encoding();
    }

    // No charset declaration found: there is nothing to convert from.
    if (_encoding.empty())
    {
        return str;
    }

    try
    {
        return boost::locale::conv::to_utf<char>(str, _encoding);
    }
    catch (const boost::locale::conv::invalid_charset_error &)
    {
        // The page declared a charset we cannot convert from; keep the
        // original bytes rather than failing.
        return str;
    }
}
/*!
 *  Looks for a charset declaration in a `<meta>` tag of `_document` and
 *  stores it in `_encoding`.
 *
 *  Both common forms are recognised:
 *    - `<meta charset="utf-8">`
 *    - `<meta http-equiv="Content-Type" content="text/html; charset=utf-8">`
 *
 *  `_encoding` is left untouched if no declaration is found.
 */
void URI::detect_encoding()
{
    // - `[^>]+` keeps the search inside a single <meta …> tag.
    // - `["']?` skips an optional opening quote so it does not end up in the
    //   captured name.
    // - The capture group stops at the first quote, whitespace, `;`, `/` or
    //   `>`, i.e. at the end of the charset token.
    const RegEx re_encoding(R"(<meta[^>]+charset\s*=\s*["']?([^"'\s;/>]+))",
                            RegEx::RE_CASELESS);
    vector<string> matches;
    re_encoding.split(_document, matches);

    // matches[0] is the whole match, matches[1] the captured charset name.
    if (matches.size() >= 2)
    {
        _encoding = matches[1];
    }
}
bool URI::is_html() const
{
const RegEx re_htmlfile(".*\\.(.?html?|xml|rss)$", RegEx::RE_CASELESS);