Clean up text before searching.
This commit is contained in:
parent
1979956f03
commit
84e2b387e5
|
@ -50,7 +50,7 @@ Ignore case distinctions in pattern and data.
|
|||
Use additional _PATTERN_ for matching. Can be used more than once.
|
||||
|
||||
*-a*, *--raw*::
|
||||
Do not strip HTML before searching.
|
||||
Do not clean up text before searching. No HTML stripping, no newline removal.
|
||||
|
||||
*-C* _NUMBER_, *--context* _NUMBER_::
|
||||
Print _NUMBER_ words of context around matches.
|
||||
|
|
|
@ -105,7 +105,7 @@ int main(int argc, char *argv[])
|
|||
}
|
||||
if (vm.count("raw") > 0)
|
||||
{
|
||||
opts.nostrip = true;
|
||||
opts.raw = true;
|
||||
}
|
||||
opts.context = vm["context"].as<std::uint64_t>();
|
||||
|
||||
|
|
|
@ -67,7 +67,7 @@ po::variables_map parse_options(int argc, char *argv[])
|
|||
->value_name(translate("PATTERN"))->composing()->required(),
|
||||
translate("Use additional PATTERN for matching.").str().data())
|
||||
("raw,a",
|
||||
translate("Do not strip HTML before searching.").str().data())
|
||||
translate("Do not clean up text before searching.").str().data())
|
||||
("context,C", po::value<std::uint64_t>()
|
||||
->value_name(translate("NUMBER"))->default_value(0),
|
||||
translate("Print NUMBER words of context around matches.").str().data())
|
||||
|
|
|
@ -63,6 +63,11 @@ std::vector<match> search(const fs::path &filepath, std::string_view regex,
|
|||
for (const auto &entry : zip::list(filepath))
|
||||
{
|
||||
auto document{zip::read_file(filepath, entry)};
|
||||
if (!opts.raw)
|
||||
{
|
||||
cleanup_text(document);
|
||||
}
|
||||
|
||||
std::string::const_iterator begin{document.begin()};
|
||||
std::string::const_iterator end{document.end()};
|
||||
boost::match_results<std::string::const_iterator> match_result;
|
||||
|
@ -109,4 +114,27 @@ context(const boost::match_results<std::string::const_iterator> &match,
|
|||
return {prefix.substr(pos_before + 2), suffix.substr(0, pos_after - 1)};
|
||||
}
|
||||
|
||||
// Clean up `text` in place so that it can be searched as plain running text.
//
// In a single pass over the input:
//   * everything from a '<' up to and including the next '>' is dropped
//     (markup tags); if a '<' is never closed, the rest of the text is dropped,
//   * carriage returns are removed,
//   * newlines are turned into spaces,
//   * runs of spaces (including those created by the steps above) are
//     collapsed into a single space.
//
// A '>' that is not preceded by an opening '<' is ordinary text and is kept.
// Text without tags, line breaks or repeated spaces is left unchanged, and an
// empty string stays empty.
void cleanup_text(std::string &text)
{
    std::string cleaned;
    cleaned.reserve(text.size());

    bool inside_tag{false};
    for (const char ch : text)
    {
        if (inside_tag)
        {
            // Skip the tag's content; the closing '>' ends the tag.
            inside_tag = (ch != '>');
            continue;
        }

        switch (ch)
        {
        case '<':
            inside_tag = true;
            break;
        case '\r':
            // Dropped entirely, so that "\r\n" yields exactly one space.
            break;
        case '\n':
        case ' ':
            // Never emit two spaces in a row.
            if (cleaned.empty() || cleaned.back() != ' ')
            {
                cleaned += ' ';
            }
            break;
        default:
            cleaned += ch;
            break;
        }
    }

    text.swap(cleaned);
}
|
||||
|
||||
} // namespace epubgrep::search
|
||||
|
|
|
@ -53,7 +53,7 @@ struct options
|
|||
regex_kind regex{regex_kind::basic};
|
||||
bool grep_like{false};
|
||||
bool ignore_case{false};
|
||||
bool nostrip{false};
|
||||
bool raw{false};
|
||||
std::uint64_t context{0};
|
||||
};
|
||||
|
||||
|
@ -65,6 +65,8 @@ struct options
|
|||
context(const boost::match_results<std::string::const_iterator> &match,
|
||||
std::uint64_t words);
|
||||
|
||||
// Clean up text in place before it is searched: strips HTML tags and removes
// newlines. Skipped by the caller when the raw option is set.
void cleanup_text(std::string &text);
|
||||
|
||||
} // namespace epubgrep::search
|
||||
|
||||
#endif // EPUBGREP_SEARCH_HPP
|
||||
|
|
Loading…
Reference in New Issue
Block a user