From a83a2548c3d1faab210c9e959a7dc23511a1a301 Mon Sep 17 00:00:00 2001
From: tastytea
Date: Sat, 13 Jul 2019 18:20:37 +0200
Subject: [PATCH] Remove HTML-encoded newlines in descriptions.

We didn't catch newlines encoded as HTML entities before.
---
 CMakeLists.txt |  2 +-
 src/uri.cpp    | 19 +++++++++++++------
 2 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 85a7352..c99330c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,6 @@
 cmake_minimum_required (VERSION 3.2)
 project(remwharead
-        VERSION 0.4.2
+        VERSION 0.4.3
         LANGUAGES CXX
         )
 
diff --git a/src/uri.cpp b/src/uri.cpp
index 802c369..2dd9646 100644
--- a/src/uri.cpp
+++ b/src/uri.cpp
@@ -64,7 +64,7 @@ const html_extract URI::get()
 
             return {
                 extract_title(answer),
-                strip_html(extract_description(answer)),
+                extract_description(answer),
                 strip_html(answer)
             };
         }
@@ -106,7 +106,7 @@ const string URI::extract_description(const string &html)
         smatch match;
         const regex re("description\"[^>]+content=\"([^\"]+)", icase);
         regex_search(html, match, re);
-        return remove_newlines(match[1].str());
+        return remove_newlines(strip_html(match[1].str()));
     }
 
     return "";
@@ -509,11 +509,18 @@ const string URI::archive()
 
 const string URI::remove_newlines(string text)
 {
-    size_t pos = 0;
-    while ((pos = text.find("\n", pos)) != std::string::npos)
+    size_t posn = 0;
+    while ((posn = text.find('\n', posn)) != std::string::npos)
     {
-        text.replace(pos, 1, " ");
-        ++pos;
+        text.replace(posn, 1, " ");
+
+        // posn == 0 has no predecessor; posn - 1 would wrap around (size_t).
+        if (posn > 0 && text[posn - 1] == '\r')
+        {
+            text.replace(posn - 1, 1, " ");
+        }
+        ++posn;
     }
+
     return text;
 }