bugfix: NO-BREAK SPACE: confused the Unicode code point (U+00A0) with its UTF-8 byte encoding (0xC2 0xA0)

This commit is contained in:
tastytea 2018-05-02 20:17:55 +02:00
parent f57ff976e5
commit b25a09608f
Signed by: tastytea
GPG Key ID: 59346E0EA35C67E5
2 changed files with 3 additions and 2 deletions

View File

@ -1,6 +1,6 @@
cmake_minimum_required (VERSION 3.7)
project (mastorss
VERSION 0.5.13
VERSION 0.5.14
LANGUAGES CXX
)

View File

@ -166,7 +166,8 @@ std::vector<string> parse_website(const string &xml)
str = std::regex_replace(str, recdata2, "");
str = std::regex_replace(str, restrip, "");
str = std::regex_replace(str, std::regex("\\r"), ""); // remove \r
str = std::regex_replace(str, std::regex("\uc2a0"), " "); // replace NO-BREAK SPACE with space
// replace NO-BREAK SPACE (code point U+00A0; UTF-8 bytes 0xC2 0xA0) with a regular space
str = std::regex_replace(str, std::regex("\u00a0"), " ");
str = std::regex_replace(str, std::regex("\\n[ \t]+\\n"), ""); // remove whitespace between newlines
str = std::regex_replace(str, std::regex("\\n{3,}"), "\n\n"); // remove excess newlines