Wrap headlines in <H> and </H> during cleanup.

2021-05-24 18:08:40 +02:00 · 2021-05-24 18:08:40 +02:00 · bb4a4c719f
commit bb4a4c719f
parent 8ab7d0f655
2 changed files with 17 additions and 16 deletions
--- a/man/epubgrep.1.adoc
+++ b/man/epubgrep.1.adoc
@@ -72,8 +72,7 @@ epubgrep does not operate on lines, but on whole files. This means you can
 search for text spanning multiple lines. All newlines will be replaced by
 spaces.

-HTML will be stripped (except headlines) and newlines will be removed unless
-*--raw* is specified.
+HTML will be stripped and newlines will be removed unless *--raw* is specified.

 === Configuration

--- a/src/search.cpp
+++ b/src/search.cpp
@@ -20,6 +20,7 @@
 #include "zip.hpp"

 #include <boost/regex.hpp>
+#include <boost/regex/v4/regex_match.hpp>

 #include <algorithm>
 #include <string>
@@ -99,18 +100,24 @@ void cleanup_text(std::string &text)
 {
    for (size_t pos{}; pos != std::string::npos; pos = text.find('<', pos))
    {
-        // Don't strip headlines. We need them later on.
-        if (text[pos + 1] == 'h' || text.substr(pos + 1, 2) == "/h")
+        // Mark headlines. We need them later on.
+        std::string replacement;
+        if (boost::regex_match(text.substr(pos, 3), boost::regex{"<[hH][1-6]"}))
        {
-            ++pos;
-            continue;
+            replacement = "<H>";
        }
-        text.erase(pos, text.find('>', pos) + 1 - pos);
+        else if (boost::regex_match(text.substr(pos, 3),
+                                    boost::regex{"</[hH]"}))
+        {
+            replacement = "</H>";
+        }
+        text.replace(pos, text.find('>', pos) + 1 - pos, replacement);
+        pos += replacement.length();
    }

    for (size_t pos{}; pos != std::string::npos; pos = text.find('\r', pos))
    {
-        text.replace(pos, 1, "");
+        text.erase(pos, 1);
    }

    for (size_t pos{}; pos != std::string::npos; pos = text.find('\n', pos))
@@ -154,15 +161,10 @@ context(const boost::match_results<std::string::const_iterator> &match,
 [[nodiscard]] std::string headline(const std::string_view prefix)
 {
    size_t pos{prefix.length()};
-    while ((pos = prefix.rfind("<h", pos)) != std::string_view::npos)
+    while ((pos = prefix.rfind("<H>", pos)) != std::string_view::npos)
    {
-        if (boost::regex_match(std::string{prefix.substr(pos, 4)},
-                               boost::regex{"<h[1-6][> ]"}))
-        {
-            pos = prefix.find('>', pos) + 1;
-            return std::string{prefix.substr(pos, prefix.find('<', pos) - pos)};
-        }
-        pos -= 2;
+        pos += 3;
+        return std::string{prefix.substr(pos, prefix.find('<', pos) - pos)};
    }

    return {};