Clean up text before searching.

2021-05-24 16:01:41 +02:00 · 2021-05-24 16:01:41 +02:00 · 84e2b387e5
commit 84e2b387e5
parent 1979956f03
5 changed files with 34 additions and 4 deletions
--- a/man/epubgrep.1.adoc
+++ b/man/epubgrep.1.adoc
@ -50,7 +50,7 @@ Ignore case distinctions in pattern and data.
 Use additional _PATTERN_ for matching. Can be used more than once.

 *-a*, *--raw*::
-Do not strip HTML before searching.
+Do not clean up text before searching. No HTML stripping, no newline removal.

 *-C* _NUMBER_, **context* _NUMBER_::
 Print _NUMBER_ words of context around matches.
--- a/src/main.cpp
+++ b/src/main.cpp
@ -105,7 +105,7 @@ int main(int argc, char *argv[])
                }
                if (vm.count("raw") > 0)
                {
-                    opts.nostrip = true;
+                    opts.raw = true;
                }
                opts.context = vm["context"].as<std::uint64_t>();

--- a/src/options.cpp
+++ b/src/options.cpp
@ -67,7 +67,7 @@ po::variables_map parse_options(int argc, char *argv[])
         ->value_name(translate("PATTERN"))->composing()->required(),
         translate("Use additional PATTERN for matching.").str().data())
        ("raw,a",
-         translate("Do not strip HTML before searching.").str().data())
+         translate("Do not clean up text before searching.").str().data())
        ("context,C", po::value<std::uint64_t>()
         ->value_name(translate("NUMBER"))->default_value(0),
         translate("Print NUMBER words of context around matches.").str().data())
--- a/src/search.cpp
+++ b/src/search.cpp
@ -63,6 +63,11 @@ std::vector<match> search(const fs::path &filepath, std::string_view regex,
    for (const auto &entry : zip::list(filepath))
    {
        auto document{zip::read_file(filepath, entry)};
+        if (!opts.raw)
+        {
+            cleanup_text(document);
+        }
+
        std::string::const_iterator begin{document.begin()};
        std::string::const_iterator end{document.end()};
        boost::match_results<std::string::const_iterator> match_result;
@ -109,4 +114,27 @@ context(const boost::match_results<std::string::const_iterator> &match,
    return {prefix.substr(pos_before + 2), suffix.substr(0, pos_after - 1)};
 }

+// Cleans up text in place so that patterns can match across markup and line
+// breaks:
+//   - removes everything from a '<' up to and including the next '>'; an
+//     unclosed '<' removes the rest of the text,
+//   - removes '\r',
+//   - turns '\n' into ' ',
+//   - squeezes runs of spaces into a single space.
+// Text outside of tags is never dropped and an empty string stays empty.
+void cleanup_text(std::string &text)
+{
+    // Build the result in one forward pass instead of erasing in place, which
+    // would shift the tail of the string once per removed tag or character.
+    std::string cleaned;
+    cleaned.reserve(text.size());
+
+    bool in_tag{false};
+    for (char ch : text)
+    {
+        if (in_tag)
+        {
+            // Still inside a tag: drop the character, '>' ends the tag.
+            in_tag = (ch != '>');
+            continue;
+        }
+        if (ch == '<')
+        {
+            in_tag = true;
+            continue;
+        }
+        if (ch == '\r')
+        {
+            continue;
+        }
+        if (ch == '\n')
+        {
+            ch = ' ';
+        }
+        // Comparing against the output also squeezes spaces that only became
+        // adjacent because a tag or '\r' between them was removed.
+        if (ch == ' ' && !cleaned.empty() && cleaned.back() == ' ')
+        {
+            continue;
+        }
+        cleaned += ch;
+    }
+
+    text.swap(cleaned);
+}
+
 } // namespace epubgrep::search
--- a/src/search.hpp
+++ b/src/search.hpp
@ -53,7 +53,7 @@ struct options
    regex_kind regex{regex_kind::basic};
    bool grep_like{false};
    bool ignore_case{false};
-    bool nostrip{false};
+    bool raw{false};
    std::uint64_t context{0};
 };

@ -65,6 +65,8 @@ struct options
 context(const boost::match_results<std::string::const_iterator> &match,
        std::uint64_t words);

+// Clean up text in place before searching (HTML stripping, newline removal).
+void cleanup_text(std::string &text);
+
 } // namespace epubgrep::search

 #endif // EPUBGREP_SEARCH_HPP