diff --git a/man/epubgrep.1.adoc b/man/epubgrep.1.adoc index 5b36cc2..41f8c08 100644 --- a/man/epubgrep.1.adoc +++ b/man/epubgrep.1.adoc @@ -50,7 +50,7 @@ Ignore case distinctions in pattern and data. Use additional _PATTERN_ for matching. Can be used more than once. *-a*, *--raw*:: -Do not strip HTML before searching. +Do not clean up text before searching. No HTML stripping, no newline removal. *-C* _NUMBER_, **context* _NUMBER_:: Print _NUMBER_ words of context around matches. diff --git a/src/main.cpp b/src/main.cpp index d47bd4a..f937f76 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -105,7 +105,7 @@ int main(int argc, char *argv[]) } if (vm.count("raw") > 0) { - opts.nostrip = true; + opts.raw = true; } opts.context = vm["context"].as(); diff --git a/src/options.cpp b/src/options.cpp index f204296..087f3e4 100644 --- a/src/options.cpp +++ b/src/options.cpp @@ -67,7 +67,7 @@ po::variables_map parse_options(int argc, char *argv[]) ->value_name(translate("PATTERN"))->composing()->required(), translate("Use additional PATTERN for matching.").str().data()) ("raw,a", - translate("Do not strip HTML before searching.").str().data()) + translate("Do not clean up text before searching.").str().data()) ("context,C", po::value() ->value_name(translate("NUMBER"))->default_value(0), translate("Print NUMBER words of context around matches.").str().data()) diff --git a/src/search.cpp b/src/search.cpp index af08d27..eb2d11d 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -63,6 +63,11 @@ std::vector search(const fs::path &filepath, std::string_view regex, for (const auto &entry : zip::list(filepath)) { auto document{zip::read_file(filepath, entry)}; + if (!opts.raw) + { + cleanup_text(document); + } + std::string::const_iterator begin{document.begin()}; std::string::const_iterator end{document.end()}; boost::match_results match_result; @@ -109,4 +114,27 @@ context(const boost::match_results &match, return {prefix.substr(pos_before + 2), suffix.substr(0, pos_after - 1)}; } 
// Clean up a document in place so that it can be searched as plain text.
//
// In a single left-to-right pass over `text`:
//   * every `<...>` span (an HTML/XML tag) is dropped,
//   * carriage returns are dropped,
//   * newlines are turned into spaces,
//   * runs of spaces (including those created by the steps above) are
//     collapsed into a single space.
//
// A '<' that has no matching '>' anywhere after it is not a tag; it and the
// text following it are kept, so no searchable text is lost on malformed
// input.
//
// text: the document to clean; it is replaced by the cleaned-up version.
void cleanup_text(std::string &text)
{
    std::string cleaned;
    cleaned.reserve(text.size());

    // Becomes false once a '<' without a closing '>' was seen. No later '<'
    // can be closed either, so we stop looking and avoid rescanning the tail.
    bool tags_possible{true};

    for (size_t pos{0}; pos < text.size(); ++pos)
    {
        char ch{text[pos]};

        if (ch == '<' && tags_possible)
        {
            const size_t tag_end{text.find('>', pos)};
            if (tag_end != std::string::npos)
            {
                // Skip the whole tag; the loop increment steps past '>'.
                pos = tag_end;
                continue;
            }
            tags_possible = false;
        }

        if (ch == '\r')
        {
            continue;
        }

        if (ch == '\n')
        {
            ch = ' ';
        }

        // Never emit two spaces in a row.
        if (ch == ' ' && !cleaned.empty() && cleaned.back() == ' ')
        {
            continue;
        }

        cleaned += ch;
    }

    text.swap(cleaned);
}