Clean up text before searching.
This commit is contained in:
parent
1979956f03
commit
84e2b387e5
|
@ -50,7 +50,7 @@ Ignore case distinctions in pattern and data.
|
||||||
Use additional _PATTERN_ for matching. Can be used more than once.
|
Use additional _PATTERN_ for matching. Can be used more than once.
|
||||||
|
|
||||||
*-a*, *--raw*::
|
*-a*, *--raw*::
|
||||||
Do not strip HTML before searching.
|
Do not clean up text before searching. No HTML stripping, no newline removal.
|
||||||
|
|
||||||
*-C* _NUMBER_, *--context* _NUMBER_::
|
*-C* _NUMBER_, *--context* _NUMBER_::
|
||||||
Print _NUMBER_ words of context around matches.
|
Print _NUMBER_ words of context around matches.
|
||||||
|
|
|
@ -105,7 +105,7 @@ int main(int argc, char *argv[])
|
||||||
}
|
}
|
||||||
if (vm.count("raw") > 0)
|
if (vm.count("raw") > 0)
|
||||||
{
|
{
|
||||||
opts.nostrip = true;
|
opts.raw = true;
|
||||||
}
|
}
|
||||||
opts.context = vm["context"].as<std::uint64_t>();
|
opts.context = vm["context"].as<std::uint64_t>();
|
||||||
|
|
||||||
|
|
|
@ -67,7 +67,7 @@ po::variables_map parse_options(int argc, char *argv[])
|
||||||
->value_name(translate("PATTERN"))->composing()->required(),
|
->value_name(translate("PATTERN"))->composing()->required(),
|
||||||
translate("Use additional PATTERN for matching.").str().data())
|
translate("Use additional PATTERN for matching.").str().data())
|
||||||
("raw,a",
|
("raw,a",
|
||||||
translate("Do not strip HTML before searching.").str().data())
|
translate("Do not clean up text before searching.").str().data())
|
||||||
("context,C", po::value<std::uint64_t>()
|
("context,C", po::value<std::uint64_t>()
|
||||||
->value_name(translate("NUMBER"))->default_value(0),
|
->value_name(translate("NUMBER"))->default_value(0),
|
||||||
translate("Print NUMBER words of context around matches.").str().data())
|
translate("Print NUMBER words of context around matches.").str().data())
|
||||||
|
|
|
@ -63,6 +63,11 @@ std::vector<match> search(const fs::path &filepath, std::string_view regex,
|
||||||
for (const auto &entry : zip::list(filepath))
|
for (const auto &entry : zip::list(filepath))
|
||||||
{
|
{
|
||||||
auto document{zip::read_file(filepath, entry)};
|
auto document{zip::read_file(filepath, entry)};
|
||||||
|
if (!opts.raw)
|
||||||
|
{
|
||||||
|
cleanup_text(document);
|
||||||
|
}
|
||||||
|
|
||||||
std::string::const_iterator begin{document.begin()};
|
std::string::const_iterator begin{document.begin()};
|
||||||
std::string::const_iterator end{document.end()};
|
std::string::const_iterator end{document.end()};
|
||||||
boost::match_results<std::string::const_iterator> match_result;
|
boost::match_results<std::string::const_iterator> match_result;
|
||||||
|
@ -109,4 +114,27 @@ context(const boost::match_results<std::string::const_iterator> &match,
|
||||||
return {prefix.substr(pos_before + 2), suffix.substr(0, pos_after - 1)};
|
return {prefix.substr(pos_before + 2), suffix.substr(0, pos_after - 1)};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Clean up text in place before it is searched.
//
// Performs, in this order:
//   1. removes markup tags (everything from '<' through the next '>'),
//   2. removes carriage returns,
//   3. turns each newline into a single space,
//   4. collapses runs of spaces into one space.
//
// Tags are removed without inserting a separator, so "a</p><p>b" becomes "ab".
void cleanup_text(std::string &text)
{
    // Every loop searches *before* touching the text, so a string that does
    // not start with the searched character keeps its first character.

    // 1. Strip tags.
    for (size_t pos{text.find('<')}; pos != std::string::npos;
         pos = text.find('<', pos))
    {
        const size_t close{text.find('>', pos)};
        if (close == std::string::npos)
        {
            // Unterminated tag: drop the remainder of the text explicitly
            // instead of relying on unsigned wrap-around of the length.
            text.erase(pos);
            break;
        }
        text.erase(pos, close + 1 - pos);
    }

    // 2. Remove carriage returns.
    for (size_t pos{text.find('\r')}; pos != std::string::npos;
         pos = text.find('\r', pos))
    {
        text.erase(pos, 1);
    }

    // 3. Newlines become spaces.
    for (size_t pos{text.find('\n')}; pos != std::string::npos;
         pos = text.find('\n', pos + 1))
    {
        text[pos] = ' ';
    }

    // 4. Collapse runs of spaces. The pattern must be TWO spaces; searching
    //    from the same position again handles runs longer than two.
    for (size_t pos{text.find("  ")}; pos != std::string::npos;
         pos = text.find("  ", pos))
    {
        text.erase(pos, 1);
    }
}
|
||||||
|
|
||||||
} // namespace epubgrep::search
|
} // namespace epubgrep::search
|
||||||
|
|
|
@ -53,7 +53,7 @@ struct options
|
||||||
regex_kind regex{regex_kind::basic};
|
regex_kind regex{regex_kind::basic};
|
||||||
bool grep_like{false};
|
bool grep_like{false};
|
||||||
bool ignore_case{false};
|
bool ignore_case{false};
|
||||||
bool nostrip{false};
|
bool raw{false};
|
||||||
std::uint64_t context{0};
|
std::uint64_t context{0};
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -65,6 +65,8 @@ struct options
|
||||||
context(const boost::match_results<std::string::const_iterator> &match,
|
context(const boost::match_results<std::string::const_iterator> &match,
|
||||||
std::uint64_t words);
|
std::uint64_t words);
|
||||||
|
|
||||||
|
// Clean up text in place before searching; per the option help, this is the
// step that --raw skips (HTML stripping, newline removal).
// NOTE(review): exact rules live in the definition — confirm there.
void cleanup_text(std::string &text);
|
||||||
|
|
||||||
} // namespace epubgrep::search
|
} // namespace epubgrep::search
|
||||||
|
|
||||||
#endif // EPUBGREP_SEARCH_HPP
|
#endif // EPUBGREP_SEARCH_HPP
|
||||||
|
|
Loading…
Reference in New Issue