Clean up text before searching.

This commit is contained in:
tastytea 2021-05-24 16:01:41 +02:00
parent 1979956f03
commit 84e2b387e5
Signed by: tastytea
GPG Key ID: CFC39497F1B26E07
5 changed files with 34 additions and 4 deletions

View File

@ -50,7 +50,7 @@ Ignore case distinctions in pattern and data.
Use additional _PATTERN_ for matching. Can be used more than once.
*-a*, *--raw*::
Do not strip HTML before searching.
Do not clean up text before searching. No HTML stripping, no newline removal.
*-C* _NUMBER_, *--context* _NUMBER_::
Print _NUMBER_ words of context around matches.

View File

@ -105,7 +105,7 @@ int main(int argc, char *argv[])
}
if (vm.count("raw") > 0)
{
opts.nostrip = true;
opts.raw = true;
}
opts.context = vm["context"].as<std::uint64_t>();

View File

@ -67,7 +67,7 @@ po::variables_map parse_options(int argc, char *argv[])
->value_name(translate("PATTERN"))->composing()->required(),
translate("Use additional PATTERN for matching.").str().data())
("raw,a",
translate("Do not strip HTML before searching.").str().data())
translate("Do not clean up text before searching.").str().data())
("context,C", po::value<std::uint64_t>()
->value_name(translate("NUMBER"))->default_value(0),
translate("Print NUMBER words of context around matches.").str().data())

View File

@ -63,6 +63,11 @@ std::vector<match> search(const fs::path &filepath, std::string_view regex,
for (const auto &entry : zip::list(filepath))
{
auto document{zip::read_file(filepath, entry)};
if (!opts.raw)
{
cleanup_text(document);
}
std::string::const_iterator begin{document.begin()};
std::string::const_iterator end{document.end()};
boost::match_results<std::string::const_iterator> match_result;
@ -109,4 +114,27 @@ context(const boost::match_results<std::string::const_iterator> &match,
return {prefix.substr(pos_before + 2), suffix.substr(0, pos_after - 1)};
}
// Clean up text in place so that it can be searched as plain prose:
//   - tags (everything from a '<' through the next '>') are removed,
//   - carriage returns are removed,
//   - newlines become spaces,
//   - runs of spaces are collapsed into a single space.
// An unterminated tag (a '<' with no closing '>') is removed together with
// everything that follows it. A '>' outside of a tag is kept as-is.
void cleanup_text(std::string &text)
{
    // Build the result in one pass instead of repeatedly erasing from the
    // middle of the string, which is quadratic on tag-heavy documents.
    std::string cleaned;
    cleaned.reserve(text.size());

    bool in_tag{false};
    for (const char chr : text)
    {
        if (in_tag)
        {
            // Drop everything inside the tag, including the closing '>'.
            in_tag = (chr != '>');
            continue;
        }

        switch (chr)
        {
        case '<':
            in_tag = true;
            break;
        case '\r':
            // Dropped entirely, so that "\r\n" yields exactly one space.
            break;
        case '\n':
        case ' ':
            // Newlines count as spaces; never emit two spaces in a row.
            if (cleaned.empty() || cleaned.back() != ' ')
            {
                cleaned += ' ';
            }
            break;
        default:
            cleaned += chr;
            break;
        }
    }

    text.swap(cleaned);
}
} // namespace epubgrep::search

View File

@ -53,7 +53,7 @@ struct options
regex_kind regex{regex_kind::basic};
bool grep_like{false};
bool ignore_case{false};
bool nostrip{false};
bool raw{false};
std::uint64_t context{0};
};
@ -65,6 +65,8 @@ struct options
context(const boost::match_results<std::string::const_iterator> &match,
std::uint64_t words);
//! Clean up text in place before searching: strip tags, drop carriage
//! returns, turn newlines into spaces and collapse runs of spaces.
void cleanup_text(std::string &text);
} // namespace epubgrep::search
#endif // EPUBGREP_SEARCH_HPP