Detect file encoding of web page and convert to UTF-8.
Some checks failed
continuous-integration/drone/push Build is failing
Some checks failed
continuous-integration/drone/push Build is failing
Fixes #6.
This commit is contained in:
parent
7c7d28b7bc
commit
6fa611cf42
|
@ -107,6 +107,7 @@ public:
|
|||
|
||||
protected:
|
||||
string _uri;
|
||||
string _encoding;
|
||||
string _document;
|
||||
|
||||
/*!
|
||||
|
@ -183,6 +184,21 @@ protected:
|
|||
[[nodiscard]]
|
||||
string cut_text(const string &text, uint16_t n_chars) const;
|
||||
|
||||
/*!
|
||||
* @brief Converts string to UTF-8.
|
||||
*
|
||||
* @since 0.9.2
|
||||
*/
|
||||
[[nodiscard]]
|
||||
inline string to_utf8(const string &str);
|
||||
|
||||
/*!
|
||||
* @brief Try to detect the encoding of the document.
|
||||
*
* Searches _document for a `charset=` declaration inside a `<meta` tag
* (case-insensitive). If one is found, the captured value is stored in
* _encoding; otherwise _encoding is left unchanged.
*
|
||||
* @since 0.9.2
|
||||
*/
|
||||
void detect_encoding();
|
||||
|
||||
/*!
|
||||
* @brief Returns true if document is *HTML.
|
||||
*
|
||||
|
|
|
@ -5,6 +5,8 @@ find_package(Poco
|
|||
COMPONENTS Foundation Net NetSSL Data DataSQLite JSON XML
|
||||
CONFIG)
|
||||
|
||||
find_package(Boost COMPONENTS Locale REQUIRED)
|
||||
|
||||
file(GLOB_RECURSE sources_lib *.cpp)
|
||||
file(GLOB_RECURSE headers_lib ../../include/*.hpp)
|
||||
|
||||
|
@ -22,7 +24,7 @@ target_include_directories(${PROJECT_NAME}
|
|||
"$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>")
|
||||
|
||||
target_link_libraries(${PROJECT_NAME}
|
||||
PRIVATE pthread
|
||||
PRIVATE pthread Boost::Locale
|
||||
PUBLIC stdc++fs)
|
||||
|
||||
# If no Poco*Config.cmake recipes are found, look for headers in standard dirs.
|
||||
|
|
|
@ -16,6 +16,7 @@
|
|||
|
||||
#include "uri.hpp"
|
||||
#include "version.hpp"
|
||||
#include <boost/locale.hpp>
|
||||
#include <Poco/Environment.h>
|
||||
#include <Poco/Exception.h>
|
||||
#include <Poco/Net/HTTPClientSession.h>
|
||||
|
@ -131,6 +132,7 @@ html_extract URI::get()
|
|||
try
|
||||
{
|
||||
_document = make_request(_uri);
|
||||
_document = to_utf8(_document);
|
||||
if (!_document.empty())
|
||||
{
|
||||
return
|
||||
|
@ -679,6 +681,27 @@ string URI::cut_text(const string &text, const uint16_t n_chars) const
|
|||
return text;
|
||||
}
|
||||
|
||||
// Converts str to UTF-8 using the charset stored in _encoding, detecting it
// from the document first if it is not known yet. Returns str unchanged when
// no charset could be determined. boost::locale::conv::to_utf may still throw
// if the declared charset name is not supported by the conversion backend.
string URI::to_utf8(const string &str)
{
    if (_encoding.empty())
    {
        detect_encoding();
    }

    // Detection is best-effort: many documents declare no charset at all.
    // Passing an empty charset name to the converter is not meaningful (it is
    // either rejected or interpreted as the process locale, depending on the
    // backend), so leave the text as it is in that case.
    if (_encoding.empty())
    {
        return str;
    }

    return boost::locale::conv::to_utf<char>(str, _encoding);
}
|
||||
|
||||
void URI::detect_encoding()
|
||||
{
|
||||
const RegEx re_encoding(R"(<meta.+charset=(.+)[";])", RegEx::RE_CASELESS);
|
||||
vector<string> matches;
|
||||
re_encoding.split(_document, matches);
|
||||
if (matches.size() >= 2)
|
||||
{
|
||||
_encoding = matches[1];
|
||||
}
|
||||
}
|
||||
|
||||
bool URI::is_html() const
|
||||
{
|
||||
const RegEx re_htmlfile(".*\\.(.?html?|xml|rss)$", RegEx::RE_CASELESS);
|
||||
|
|
Loading…
Reference in New Issue
Block a user