diff --git a/include/uri.hpp b/include/uri.hpp index 71b0a57..d88d128 100644 --- a/include/uri.hpp +++ b/include/uri.hpp @@ -107,6 +107,7 @@ public: protected: string _uri; + string _encoding; string _document; /*! @@ -183,6 +184,21 @@ protected: [[nodiscard]] string cut_text(const string &text, uint16_t n_chars) const; + /*! + * @brief Converts @p str to UTF-8, using the encoding stored in _encoding. + * @note Calls detect_encoding() first if _encoding is empty. NOTE(review): declared inline here but defined in src/lib/uri.cpp; confirm no other translation unit needs to call it. + * @since 0.9.2 + */ + [[nodiscard]] + inline string to_utf8(const string &str); + + /*! + * @brief Tries to detect the encoding of the document. + * @note Stores the result in _encoding; _encoding is left unchanged if nothing matches. + * @since 0.9.2 + */ + void detect_encoding(); + /*! * @brief Returns true if document is *HTML. * diff --git a/src/lib/CMakeLists.txt b/src/lib/CMakeLists.txt index 09d46f1..87fb524 100644 --- a/src/lib/CMakeLists.txt +++ b/src/lib/CMakeLists.txt @@ -5,6 +5,8 @@ find_package(Poco COMPONENTS Foundation Net NetSSL Data DataSQLite JSON XML CONFIG) +find_package(Boost COMPONENTS Locale REQUIRED) + file(GLOB_RECURSE sources_lib *.cpp) file(GLOB_RECURSE headers_lib ../../include/*.hpp) @@ -22,7 +24,7 @@ target_include_directories(${PROJECT_NAME} "$") target_link_libraries(${PROJECT_NAME} - PRIVATE pthread + PRIVATE pthread Boost::Locale PUBLIC stdc++fs) # If no Poco*Config.cmake recipes are found, look for headers in standard dirs. 
diff --git a/src/lib/uri.cpp b/src/lib/uri.cpp
index c875a52..fb0d30c 100644
--- a/src/lib/uri.cpp
+++ b/src/lib/uri.cpp
@@ -16,6 +16,7 @@
 
 #include "uri.hpp"
 #include "version.hpp"
+#include <boost/locale.hpp>
 #include
 #include
 #include
@@ -131,6 +132,7 @@ html_extract URI::get()
     try
     {
         _document = make_request(_uri);
+        _document = to_utf8(_document);
         if (!_document.empty())
         {
             return
@@ -679,6 +681,55 @@ string URI::cut_text(const string &text, const uint16_t n_chars) const
     return text;
 }
 
+/*!
+ * Converts str to UTF-8, treating it as text in the charset stored in
+ * _encoding. detect_encoding() is run first if no charset is stored yet.
+ * Returns str unchanged when no charset was found or Boost.Locale does not
+ * recognise the charset name.
+ */
+string URI::to_utf8(const string &str)
+{
+    if (_encoding.empty())
+    {
+        detect_encoding();
+    }
+
+    // Nothing detected: an empty charset name is not a usable conversion
+    // source, so hand the text back untouched instead of calling to_utf().
+    if (_encoding.empty())
+    {
+        return str;
+    }
+
+    try
+    {
+        return boost::locale::conv::to_utf<char>(str, _encoding);
+    }
+    catch (const boost::locale::conv::invalid_charset_error &)
+    {
+        // The document declared a charset Boost.Locale cannot convert from.
+        return str;
+    }
+}
+
+/*!
+ * Searches _document for a charset declaration in a <meta> tag and stores the
+ * declared name in _encoding. _encoding is left as it was if nothing matches.
+ */
+void URI::detect_encoding()
+{
+    // Capture group 1 is the charset name. Covers both <meta charset="...">
+    // and <meta http-equiv="Content-Type" content="text/html; charset=...">.
+    const RegEx re_encoding(R"(<meta[^>]+charset\s*=\s*["']?([\w.:-]+))",
+                            RegEx::RE_CASELESS);
+    vector<string> matches;
+    re_encoding.split(_document, matches);
+    if (matches.size() >= 2)
+    {
+        _encoding = matches[1];
+    }
+}
+
 bool URI::is_html() const
 {
     const RegEx re_htmlfile(".*\\.(.?html?|xml|rss)$", RegEx::RE_CASELESS);