Extract headlines.
This commit is contained in:
parent
8b21f4a8b9
commit
8ab7d0f655
|
@ -71,6 +71,8 @@ std::vector<match> search(const fs::path &filepath, std::string_view regex,
|
|||
std::string::const_iterator begin{document.begin()};
|
||||
std::string::const_iterator end{document.end()};
|
||||
boost::match_results<std::string::const_iterator> match_result;
|
||||
std::string last_headline;
|
||||
|
||||
while (boost::regex_search(begin, end, match_result, re,
|
||||
boost::match_default))
|
||||
{
|
||||
|
@ -78,6 +80,12 @@ std::vector<match> search(const fs::path &filepath, std::string_view regex,
|
|||
match.filepath = entry;
|
||||
match.text = match_result[0];
|
||||
match.context = context(match_result, opts.context);
|
||||
const auto current_headline{headline(match_result.prefix().str())};
|
||||
if (!current_headline.empty())
|
||||
{
|
||||
last_headline = current_headline;
|
||||
}
|
||||
match.headline = last_headline;
|
||||
|
||||
matches.emplace_back(match);
|
||||
begin = match_result[0].second;
|
||||
|
@ -94,6 +102,7 @@ void cleanup_text(std::string &text)
|
|||
// Don't strip headlines. We need them later on.
|
||||
if (text[pos + 1] == 'h' || text.substr(pos + 1, 2) == "/h")
|
||||
{
|
||||
++pos;
|
||||
continue;
|
||||
}
|
||||
text.erase(pos, text.find('>', pos) + 1 - pos);
|
||||
|
@ -142,4 +151,21 @@ context(const boost::match_results<std::string::const_iterator> &match,
|
|||
return {prefix.substr(pos_before + 2), suffix.substr(0, pos_after - 1)};
|
||||
}
|
||||
|
||||
//! Return the text of the last headline (<h1> to <h6>) in prefix.
//!
//! Scans prefix backwards for the closest opening heading tag and returns the
//! text between the end of that tag and the next '<'. If the headline is not
//! closed inside prefix, everything after the opening tag is returned.
//!
//! @param prefix Text preceding a match; heading tags must still be present.
//! @return The headline text, or an empty string if prefix contains no
//!         complete opening heading tag.
[[nodiscard]] std::string headline(const std::string_view prefix)
{
    size_t pos{prefix.length()};
    while ((pos = prefix.rfind("<h", pos)) != std::string_view::npos)
    {
        // Equivalent to matching "<h[1-6][> ]" against the 4 characters at
        // pos, without building a regex and a temporary string each round.
        const std::string_view tag{prefix.substr(pos, 4)};
        const bool is_heading{tag.length() == 4 && tag[2] >= '1'
                              && tag[2] <= '6'
                              && (tag[3] == '>' || tag[3] == ' ')};
        if (is_heading)
        {
            const size_t tag_end{prefix.find('>', pos)};
            // An opening tag that is cut off has no headline text; keep
            // looking for an earlier, complete headline instead.
            if (tag_end != std::string_view::npos)
            {
                const size_t text_begin{tag_end + 1};
                return std::string{
                    prefix.substr(text_begin,
                                  prefix.find('<', text_begin) - text_begin)};
            }
        }

        // No "<h" can start before index 0, and stepping back from 0 or 1
        // would wrap around and restart the search from the end (endless
        // loop on input like "<html>…").
        if (pos < 2)
        {
            break;
        }
        pos -= 2;
    }

    return {};
}
|
||||
|
||||
} // namespace epubgrep::search
|
||||
|
|
|
@ -70,6 +70,9 @@ void cleanup_text(std::string &text);
|
|||
context(const boost::match_results<std::string::const_iterator> &match,
|
||||
std::uint64_t words);
|
||||
|
||||
//! Return the text of the last headline (h1-h6) in prefix, or an empty string if there is none.
|
||||
[[nodiscard]] std::string headline(std::string_view prefix);
|
||||
|
||||
} // namespace epubgrep::search
|
||||
|
||||
#endif // EPUBGREP_SEARCH_HPP
|
||||
|
|
Loading…
Reference in New Issue
Block a user