Don't inject page numbers and headline-markers into the text.

The metadata is recorded in position → data pairs. Closes: #13
2021-06-06 21:26:09 +02:00 · 2021-06-06 21:26:09 +02:00 · b8431019b7
commit b8431019b7
parent ebb8b63830
2 changed files with 91 additions and 82 deletions
--- a/src/search.cpp
+++ b/src/search.cpp
@ -27,6 +27,8 @@

 #include <algorithm>
 #include <array>
+#include <iterator>
+#include <memory>
 #include <string>
 #include <string_view>
+#include <utility>
 #include <vector>
@ -83,18 +85,18 @@ std::vector<match> search(const fs::path &filepath,
    for (const auto &entry : epub_filepaths)
    {
        DEBUGLOG << "Processing " << entry;
-        auto document{zip::read_file(filepath, entry)};
-        if (!opts.raw)
+        file_in_epub file;
        {
-            cleanup_text(document);
-            document = helpers::unescape_html(document);
+            const auto document{zip::read_file(filepath, entry)};
+            if (!opts.raw)
+            {
+                file = cleanup_text(helpers::unescape_html(document));
+            }
        }

-        string::const_iterator begin{document.begin()};
-        string::const_iterator end{document.end()};
+        string::const_iterator begin{file.text.begin()};
+        string::const_iterator end{file.text.end()};
        boost::match_results<string::const_iterator> match_result;
-        string last_headline;
-        string last_page;

        while (boost::regex_search(begin, end, match_result, re,
                                   boost::match_default))
@ -104,18 +106,10 @@ std::vector<match> search(const fs::path &filepath,
            match.filepath_inside = entry;
            match.text = match_result[0];
            match.context = context(match_result, opts.context);
-            const auto current_headline{headline(match_result.prefix().str())};
-            if (!current_headline.empty())
-            {
-                last_headline = current_headline;
-            }
-            match.headline = last_headline;
-            const auto current_page{page(match_result.prefix().str())};
-            if (!current_page.empty())
-            {
-                last_page = current_page;
-            }
-            match.page = last_page;
+            const auto pos = static_cast<size_t>(
+                std::distance(begin, match_result[0].begin()));
+            match.headline = headline(file, pos);
+            match.page = page(file, pos);

            matches.emplace_back(match);
            begin = match_result[0].second;
@ -125,75 +119,84 @@ std::vector<match> search(const fs::path &filepath,
    return matches;
 }

-void cleanup_text(string &text)
+// Strip markup from text and normalise its whitespace. Headlines and page
+// labels found in the markup are not left in the text; they are returned in
+// the maps of the result, keyed by the offset in the stripped text at which
+// they were found.
+file_in_epub cleanup_text(const std::string_view text)
 {
+    string output{text};
     static const boost::regex re_header_start{"<[hH][1-6]"};
     static const boost::regex re_header_end{"</[hH][1-6]"};
     static const boost::regex re_pagebreak{"[^>]+pagebreak[^>]+"
                                            "(title|aria-label)"
                                            "=\"([[:alnum:]]+)\""};

-    size_t pos{};
-    while ((pos = text.find('<', pos)) != string::npos)
+    // TODO: Make this more efficient: merge the 3 whitespace passes into 1.
+    size_t pos{0};
+    while ((pos = output.find('\r', pos)) != string::npos)
     {
-        auto endpos{text.find('>', pos) + 1};
+        output.erase(pos, 1);
+    }

-        // Mark headlines. We need them later on.
-        string replacement;
-        if (boost::regex_match(text.substr(pos, 3), re_header_start))
+    pos = 0;
+    while ((pos = output.find_first_of("\n\t", pos)) != string::npos)
+    {
+        output.replace(pos, 1, " ");
+    }
+
+    pos = 0;
+    while ((pos = output.find("  ", pos)) != string::npos)
+    {
+        output.replace(pos, 2, " ");
+    }
+
+    pos = 0;
+    file_in_epub file;
+    // Offset of the text of the headline currently open, npos if none is.
+    size_t headline_start{string::npos};
+
+    // Offset just past the next '>' at or after from. An unterminated tag is
+    // treated as running to the end of the text: adding 1 to the npos that
+    // find() returns would wrap around to 0 and place the end of the tag
+    // before its start.
+    const auto tag_end = [&output](const size_t from)
+    {
+        const auto close{output.find('>', from)};
+        return (close == string::npos) ? output.size() : close + 1;
+    };
+
+    while ((pos = output.find('<', pos)) != string::npos)
+    {
+        auto endpos{tag_end(pos)};
+
+        if (boost::regex_match(output.substr(pos, 3), re_header_start))
         {
-            replacement = "<H>";
+            headline_start = pos;
         }
-        else if (boost::regex_match(text.substr(pos, 4), re_header_end))
+        else if (boost::regex_match(output.substr(pos, 4), re_header_end))
         {
-            replacement = "</H>";
+            if (headline_start != string::npos)
+            {
+                file.headlines.insert(
+                    {headline_start,
+                     output.substr(headline_start, pos - headline_start)});
+                headline_start = string::npos;
+            }
         }
-        else if (text.substr(pos, 6) == "<span ")
+        else if (output.substr(pos, 6) == "<span ")
         {
             boost::match_results<string::const_iterator> match;
             using it_size_t = string::const_iterator::difference_type;
-            string::const_iterator begin{text.begin()
+            string::const_iterator begin{output.begin()
                                          + static_cast<it_size_t>(pos)};
-            string::const_iterator end{text.begin()
+            string::const_iterator end{output.begin()
                                        + static_cast<it_size_t>(endpos)};

             if (boost::regex_search(begin, end, match, re_pagebreak))
             {
-                replacement = format("<PAGE {0:s}>", match[2].str());
+                file.pages.insert({pos, match[2].str()});
             }
         }
-        else if (text.substr(pos, 7) == "<style "
-                 || text.substr(pos, 8) == "<script ")
+        else if (output.substr(pos, 7) == "<style "
+                 || output.substr(pos, 8) == "<script ")
         {
-            if (text.find("/>", pos) > endpos)
+            if (output.find("/>", pos) > endpos)
             {
-                endpos = text.find('>', endpos) + 1;
+                // Not self-closing: drop everything up to the end of the
+                // following tag as well.
+                endpos = tag_end(endpos);
             }
         }

-        DEBUGLOG << "Replacing '" << text.substr(pos, endpos - pos)
-                 << "' with '" << replacement << "'.";
-        text.replace(pos, endpos - pos, replacement);
-        pos += replacement.length();
+        output.erase(pos, endpos - pos);
     }

-    pos = 0;
-    while ((pos = text.find('\r', pos)) != string::npos)
-    {
-        text.erase(pos, 1);
-    }
+    // output is not used again, so hand its buffer over instead of copying.
+    file.text = std::move(output);

-    pos = 0;
-    while ((pos = text.find_first_of("\n\t", pos)) != string::npos)
-    {
-        text.replace(pos, 1, " ");
-    }
-
-    pos = 0;
-    while ((pos = text.find("  ", pos)) != string::npos)
-    {
-        text.replace(pos, 2, " ");
-    }
+    return file;
 }

 match_context context(const boost::match_results<string::const_iterator> &match,
@ -264,39 +267,36 @@ match_context context(const boost::match_results<string::const_iterator> &match,
    return {before, after};
 }

-string headline(const std::string_view prefix)
+// Return the text of the last headline recorded at or before pos, or an
+// empty string if no headline is recorded that early.
+std::string headline(const file_in_epub &file, const size_t pos)
 {
-    size_t pos{prefix.length()};
-    if ((pos = prefix.rfind("<H>", pos)) != std::string_view::npos)
+    // upper_bound() yields the first headline located after pos, so the entry
+    // before it is the one in effect. Returning the stored std::string avoids
+    // building a string from the null data() of an empty string_view.
+    const auto next{file.headlines.upper_bound(pos)};
+    if (next == file.headlines.begin())
     {
-        pos += 3;
-        string result{prefix.substr(pos, prefix.find('<', pos) - pos)};
-
-        while (helpers::is_whitespace(*result.begin()))
-        {
-            result.erase(0, 1);
-        }
-        while (helpers::is_whitespace(*result.rbegin()))
-        {
-            result.erase(result.size() - 1);
-        }
-
-        return result;
+        return {};
     }

-    return {};
+    return std::prev(next)->second;
 }

-string page(const std::string_view prefix)
+// Return the label of the last page break recorded at or before pos, or an
+// empty string if no page break is recorded that early.
+string page(const file_in_epub &file, const size_t pos)
 {
-    size_t pos{prefix.length()};
-    while ((pos = prefix.rfind("<PAGE ", pos)) != std::string_view::npos)
+    // upper_bound() yields the first page break located after pos, so the
+    // entry before it is the one in effect. Returning the stored string avoids
+    // building a string from the null data() of an empty string_view.
+    const auto next{file.pages.upper_bound(pos)};
+    if (next == file.pages.begin())
     {
-        pos += 6;
-        return string{prefix.substr(pos, prefix.find('>', pos) - pos)};
+        return {};
     }

-    return {};
+    return std::prev(next)->second;
 }

 } // namespace epubgrep::search
--- a/src/search.hpp
+++ b/src/search.hpp
@ -22,7 +22,9 @@

 #include <boost/regex.hpp>

+#include <cstddef>
 #include <cstdint>
+#include <map>
 #include <string>
 #include <string_view>
 #include <utility>
@ -52,13 +54,20 @@ struct settings
    std::uint64_t context{0};
 };

+//! Cleaned-up text of one file inside an EPUB and the metadata taken out of it.
+struct file_in_epub
+{
+    //! Text that is searched.
+    std::string text;
+    //! Headline texts, keyed by position.
+    std::map<std::size_t, std::string> headlines;
+    //! Page labels, keyed by position.
+    std::map<std::size_t, std::string> pages;
+};
+
 //! Search file, return matches.
 [[nodiscard]] std::vector<match> search(const fs::path &filepath,
                                        std::string_view regex,
                                        const settings &opts);

 //! Strip HTML, remove newlines, condense spaces.
-void cleanup_text(std::string &text);
+[[nodiscard]] file_in_epub cleanup_text(std::string_view text);

 //! Return words before and after the match.
 [[nodiscard]] match_context
@ -66,10 +75,10 @@ context(const boost::match_results<std::string::const_iterator> &match,
        std::uint64_t words);

 //! Return last headline if possible.
-[[nodiscard]] std::string headline(std::string_view prefix);
+[[nodiscard]] std::string headline(const file_in_epub &file, size_t pos);

 //! Return current page if possible.
-[[nodiscard]] std::string page(std::string_view prefix);
+[[nodiscard]] std::string page(const file_in_epub &file, size_t pos);

 } // namespace epubgrep::search