Detect file encoding of web page and convert to UTF-8.
Some checks failed
continuous-integration/drone/push Build is failing
Some checks failed
continuous-integration/drone/push Build is failing
Fixes #6.
This commit is contained in:
parent
7c7d28b7bc
commit
6fa611cf42
|
@ -107,6 +107,7 @@ public:
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
string _uri;
|
string _uri;
|
||||||
|
string _encoding;
|
||||||
string _document;
|
string _document;
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
|
@ -183,6 +184,21 @@ protected:
|
||||||
[[nodiscard]]
|
[[nodiscard]]
|
||||||
string cut_text(const string &text, uint16_t n_chars) const;
|
string cut_text(const string &text, uint16_t n_chars) const;
|
||||||
|
|
||||||
|
/*!
|
||||||
|
* @brief Converts string to UTF-8.
|
||||||
|
*
|
||||||
|
* @since 0.9.2
|
||||||
|
*/
|
||||||
|
[[nodiscard]]
|
||||||
|
inline string to_utf8(const string &str);
|
||||||
|
|
||||||
|
/*!
|
||||||
|
* @brief Try to detect the encoding of the document.
|
||||||
|
*
|
||||||
|
* @since 0.9.2
|
||||||
|
*/
|
||||||
|
void detect_encoding();
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
* @brief Returns true if document is *HTML.
|
* @brief Returns true if document is *HTML.
|
||||||
*
|
*
|
||||||
|
|
|
@ -5,6 +5,8 @@ find_package(Poco
|
||||||
COMPONENTS Foundation Net NetSSL Data DataSQLite JSON XML
|
COMPONENTS Foundation Net NetSSL Data DataSQLite JSON XML
|
||||||
CONFIG)
|
CONFIG)
|
||||||
|
|
||||||
|
find_package(Boost COMPONENTS Locale REQUIRED)
|
||||||
|
|
||||||
file(GLOB_RECURSE sources_lib *.cpp)
|
file(GLOB_RECURSE sources_lib *.cpp)
|
||||||
file(GLOB_RECURSE headers_lib ../../include/*.hpp)
|
file(GLOB_RECURSE headers_lib ../../include/*.hpp)
|
||||||
|
|
||||||
|
@ -22,7 +24,7 @@ target_include_directories(${PROJECT_NAME}
|
||||||
"$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>")
|
"$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>")
|
||||||
|
|
||||||
target_link_libraries(${PROJECT_NAME}
|
target_link_libraries(${PROJECT_NAME}
|
||||||
PRIVATE pthread
|
PRIVATE pthread Boost::Locale
|
||||||
PUBLIC stdc++fs)
|
PUBLIC stdc++fs)
|
||||||
|
|
||||||
# If no Poco*Config.cmake recipes are found, look for headers in standard dirs.
|
# If no Poco*Config.cmake recipes are found, look for headers in standard dirs.
|
||||||
|
|
|
@ -16,6 +16,7 @@
|
||||||
|
|
||||||
#include "uri.hpp"
|
#include "uri.hpp"
|
||||||
#include "version.hpp"
|
#include "version.hpp"
|
||||||
|
#include <boost/locale.hpp>
|
||||||
#include <Poco/Environment.h>
|
#include <Poco/Environment.h>
|
||||||
#include <Poco/Exception.h>
|
#include <Poco/Exception.h>
|
||||||
#include <Poco/Net/HTTPClientSession.h>
|
#include <Poco/Net/HTTPClientSession.h>
|
||||||
|
@ -131,6 +132,7 @@ html_extract URI::get()
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
_document = make_request(_uri);
|
_document = make_request(_uri);
|
||||||
|
_document = to_utf8(_document);
|
||||||
if (!_document.empty())
|
if (!_document.empty())
|
||||||
{
|
{
|
||||||
return
|
return
|
||||||
|
@ -679,6 +681,27 @@ string URI::cut_text(const string &text, const uint16_t n_chars) const
|
||||||
return text;
|
return text;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*!
 *  Converts `str` to UTF-8 using the encoding stored in `_encoding`.
 *  When `_encoding` is empty, detect_encoding() is asked to determine it
 *  first. If no encoding is known afterwards, or the detected name is not a
 *  charset Boost.Locale recognises, the input is returned unchanged instead
 *  of throwing.
 */
string URI::to_utf8(const string &str)
{
    if (_encoding.empty())
    {
        detect_encoding();
    }

    // Detection may legitimately find nothing (no charset declaration).
    // Passing an empty charset name to Boost.Locale would throw, so keep
    // the text as it is.
    if (_encoding.empty())
    {
        return str;
    }

    try
    {
        return boost::locale::conv::to_utf<char>(str, _encoding);
    }
    catch (const boost::locale::conv::invalid_charset_error &)
    {
        // The charset label comes from the remote page and may be bogus or
        // unsupported; returning the unconverted text is better than
        // failing the whole request.
        return str;
    }
}
|
||||||
|
|
||||||
|
void URI::detect_encoding()
|
||||||
|
{
|
||||||
|
const RegEx re_encoding(R"(<meta.+charset=(.+)[";])", RegEx::RE_CASELESS);
|
||||||
|
vector<string> matches;
|
||||||
|
re_encoding.split(_document, matches);
|
||||||
|
if (matches.size() >= 2)
|
||||||
|
{
|
||||||
|
_encoding = matches[1];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
bool URI::is_html() const
|
bool URI::is_html() const
|
||||||
{
|
{
|
||||||
const RegEx re_htmlfile(".*\\.(.?html?|xml|rss)$", RegEx::RE_CASELESS);
|
const RegEx re_htmlfile(".*\\.(.?html?|xml|rss)$", RegEx::RE_CASELESS);
|
||||||
|
|
Loading…
Reference in New Issue
Block a user