better regular expressions for cleaning

This commit is contained in:
tastytea 2018-04-20 15:43:23 +02:00
parent b7616e3f44
commit e887cbbf4b
Signed by: tastytea
GPG Key ID: 59346E0EA35C67E5
2 changed files with 5 additions and 4 deletions

View File

@@ -1,6 +1,6 @@
cmake_minimum_required (VERSION 3.7) cmake_minimum_required (VERSION 3.7)
project (mastorss project (mastorss
VERSION 0.5.7 VERSION 0.5.8
LANGUAGES CXX LANGUAGES CXX
) )

View File

@@ -155,9 +155,10 @@ std::vector<string> parse_website(const string &xml)
str = std::regex_replace(str, recdata1, ""); str = std::regex_replace(str, recdata1, "");
str = std::regex_replace(str, recdata2, ""); str = std::regex_replace(str, recdata2, "");
str = std::regex_replace(str, restrip, ""); str = std::regex_replace(str, restrip, "");
str = std::regex_replace(str, std::regex("[\\r\\n] +[\\r\\n]"), "\n\n"); // remove space between newlines str = std::regex_replace(str, std::regex("\\r"), ""); // remove \r
str = std::regex_replace(str, std::regex("[\\r\\n]{3,}"), "\n"); // remove excess newlines str = std::regex_replace(str, std::regex("\\n +\\n"), "\n\n"); // remove space between newlines
str = std::regex_replace(str, std::regex("^[ \t]*$"), ""); // remove nothing str = std::regex_replace(str, std::regex("\\n{3,}"), "\n\n"); // remove excess newlines
str = std::regex_replace(str, std::regex("\\n[ \t]*\\n"), ""); // remove nothing
for (const string &hashtag : watchwords) for (const string &hashtag : watchwords)
{ {