Extract headlines.

This commit is contained in:
tastytea 2021-05-24 17:18:10 +02:00
parent 8b21f4a8b9
commit 8ab7d0f655
Signed by: tastytea
GPG Key ID: CFC39497F1B26E07
2 changed files with 29 additions and 0 deletions

View File

@ -71,6 +71,8 @@ std::vector<match> search(const fs::path &filepath, std::string_view regex,
std::string::const_iterator begin{document.begin()}; std::string::const_iterator begin{document.begin()};
std::string::const_iterator end{document.end()}; std::string::const_iterator end{document.end()};
boost::match_results<std::string::const_iterator> match_result; boost::match_results<std::string::const_iterator> match_result;
std::string last_headline;
while (boost::regex_search(begin, end, match_result, re, while (boost::regex_search(begin, end, match_result, re,
boost::match_default)) boost::match_default))
{ {
@ -78,6 +80,12 @@ std::vector<match> search(const fs::path &filepath, std::string_view regex,
match.filepath = entry; match.filepath = entry;
match.text = match_result[0]; match.text = match_result[0];
match.context = context(match_result, opts.context); match.context = context(match_result, opts.context);
const auto current_headline{headline(match_result.prefix().str())};
if (!current_headline.empty())
{
last_headline = current_headline;
}
match.headline = last_headline;
matches.emplace_back(match); matches.emplace_back(match);
begin = match_result[0].second; begin = match_result[0].second;
@ -94,6 +102,7 @@ void cleanup_text(std::string &text)
// Don't strip headlines. We need them later on. // Don't strip headlines. We need them later on.
if (text[pos + 1] == 'h' || text.substr(pos + 1, 2) == "/h") if (text[pos + 1] == 'h' || text.substr(pos + 1, 2) == "/h")
{ {
++pos;
continue; continue;
} }
text.erase(pos, text.find('>', pos) + 1 - pos); text.erase(pos, text.find('>', pos) + 1 - pos);
@ -142,4 +151,21 @@ context(const boost::match_results<std::string::const_iterator> &match,
return {prefix.substr(pos_before + 2), suffix.substr(0, pos_after - 1)}; return {prefix.substr(pos_before + 2), suffix.substr(0, pos_after - 1)};
} }
// Return the text of the last headline that opens inside prefix.
//
// Scans prefix backwards for the last opening tag of the form "<h1" to "<h6"
// followed by '>' or ' ' and returns the text between the end of that tag and
// the next '<' (or the end of prefix, if the headline is not closed yet).
// Returns an empty string if prefix contains no complete opening headline tag.
[[nodiscard]] std::string headline(const std::string_view prefix)
{
    size_t pos{prefix.length()};
    while ((pos = prefix.rfind("<h", pos)) != std::string_view::npos)
    {
        // Equivalent to matching the 4 characters against "<h[1-6][> ]", but
        // without building a regex and a temporary string on every iteration.
        const std::string_view tag{prefix.substr(pos, 4)};
        const bool is_headline{tag.length() == 4 && tag[2] >= '1'
                               && tag[2] <= '6'
                               && (tag[3] == '>' || tag[3] == ' ')};
        if (is_headline)
        {
            const size_t tag_end{prefix.find('>', pos)};
            if (tag_end != std::string_view::npos)
            {
                const size_t text_begin{tag_end + 1};
                // find() may return npos here; substr() then clamps the
                // length so that everything up to the end is returned.
                return std::string{
                    prefix.substr(text_begin,
                                  prefix.find('<', text_begin) - text_begin)};
            }
            // The tag is cut off before its closing '>' (prefix ends inside
            // the tag). Skip it and keep looking for an earlier headline.
        }

        // Nothing can precede offset 0. Stopping here also prevents pos from
        // wrapping around, which would restart the search from the end and
        // loop forever.
        if (pos == 0)
        {
            break;
        }
        --pos;
    }

    return {};
}
} // namespace epubgrep::search } // namespace epubgrep::search

View File

@ -70,6 +70,9 @@ void cleanup_text(std::string &text);
context(const boost::match_results<std::string::const_iterator> &match, context(const boost::match_results<std::string::const_iterator> &match,
std::uint64_t words); std::uint64_t words);
//! Return last headline if possible.
//!
//! Searches @p prefix backwards for the last opening `<h1>` to `<h6>` tag and
//! returns the text that follows that tag, up to the next `<` or the end of
//! @p prefix. Returns an empty string if no such tag is found.
[[nodiscard]] std::string headline(std::string_view prefix);
} // namespace epubgrep::search } // namespace epubgrep::search
#endif // EPUBGREP_SEARCH_HPP #endif // EPUBGREP_SEARCH_HPP