Extract headlines.
This commit is contained in:
parent
8b21f4a8b9
commit
8ab7d0f655
|
@ -71,6 +71,8 @@ std::vector<match> search(const fs::path &filepath, std::string_view regex,
|
||||||
std::string::const_iterator begin{document.begin()};
|
std::string::const_iterator begin{document.begin()};
|
||||||
std::string::const_iterator end{document.end()};
|
std::string::const_iterator end{document.end()};
|
||||||
boost::match_results<std::string::const_iterator> match_result;
|
boost::match_results<std::string::const_iterator> match_result;
|
||||||
|
std::string last_headline;
|
||||||
|
|
||||||
while (boost::regex_search(begin, end, match_result, re,
|
while (boost::regex_search(begin, end, match_result, re,
|
||||||
boost::match_default))
|
boost::match_default))
|
||||||
{
|
{
|
||||||
|
@ -78,6 +80,12 @@ std::vector<match> search(const fs::path &filepath, std::string_view regex,
|
||||||
match.filepath = entry;
|
match.filepath = entry;
|
||||||
match.text = match_result[0];
|
match.text = match_result[0];
|
||||||
match.context = context(match_result, opts.context);
|
match.context = context(match_result, opts.context);
|
||||||
|
const auto current_headline{headline(match_result.prefix().str())};
|
||||||
|
if (!current_headline.empty())
|
||||||
|
{
|
||||||
|
last_headline = current_headline;
|
||||||
|
}
|
||||||
|
match.headline = last_headline;
|
||||||
|
|
||||||
matches.emplace_back(match);
|
matches.emplace_back(match);
|
||||||
begin = match_result[0].second;
|
begin = match_result[0].second;
|
||||||
|
@ -94,6 +102,7 @@ void cleanup_text(std::string &text)
|
||||||
// Don't strip headlines. We need them later on.
|
// Don't strip headlines. We need them later on.
|
||||||
if (text[pos + 1] == 'h' || text.substr(pos + 1, 2) == "/h")
|
if (text[pos + 1] == 'h' || text.substr(pos + 1, 2) == "/h")
|
||||||
{
|
{
|
||||||
|
++pos;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
text.erase(pos, text.find('>', pos) + 1 - pos);
|
text.erase(pos, text.find('>', pos) + 1 - pos);
|
||||||
|
@ -142,4 +151,21 @@ context(const boost::match_results<std::string::const_iterator> &match,
|
||||||
return {prefix.substr(pos_before + 2), suffix.substr(0, pos_after - 1)};
|
return {prefix.substr(pos_before + 2), suffix.substr(0, pos_after - 1)};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//! Return the text of the last headline that is opened in `prefix`.
//!
//! Scans `prefix` backwards for an opening tag of the form `<h1>` … `<h6>`
//! (optionally followed by attributes, e.g. `<h2 class="x">`) and returns the
//! text between the end of that tag and the next `<`, or the end of `prefix`
//! if no further `<` follows.
//!
//! @param prefix Text in which to look for a headline.
//! @return The headline text, or an empty string if `prefix` contains no
//!         complete headline opening tag.
[[nodiscard]] std::string headline(const std::string_view prefix)
{
    constexpr std::string_view tag_start{"<h"};

    std::size_t pos{prefix.length()};
    while ((pos = prefix.rfind(tag_start, pos)) != std::string_view::npos)
    {
        // Accept only "<h[1-6]>" or "<h[1-6] "; this rejects <html>, <head>,
        // <hr>, … as well as candidates truncated by the end of the prefix.
        const std::string_view tag{prefix.substr(pos, 4)};
        const bool is_headline{tag.length() == 4 && tag[2] >= '1'
                               && tag[2] <= '6'
                               && (tag[3] == '>' || tag[3] == ' ')};
        if (is_headline)
        {
            const std::size_t tag_end{prefix.find('>', pos)};
            // If the tag is not closed inside the prefix, there is no
            // headline text to extract here; keep looking further back.
            if (tag_end != std::string_view::npos)
            {
                const std::size_t text_begin{tag_end + 1};
                const std::size_t text_end{prefix.find('<', text_begin)};
                if (text_end == std::string_view::npos)
                {
                    return std::string{prefix.substr(text_begin)};
                }
                return std::string{
                    prefix.substr(text_begin, text_end - text_begin)};
            }
        }

        // "<h" cannot start at pos - 1 (that character would have to be both
        // '<' and 'h'), so resume two characters earlier. Stop instead of
        // letting the unsigned position wrap around, which would restart the
        // search from the end of the prefix and loop forever.
        if (pos < 2)
        {
            break;
        }
        pos -= 2;
    }

    return {};
}
|
||||||
|
|
||||||
} // namespace epubgrep::search
|
} // namespace epubgrep::search
|
||||||
|
|
|
@ -70,6 +70,9 @@ void cleanup_text(std::string &text);
|
||||||
context(const boost::match_results<std::string::const_iterator> &match,
|
context(const boost::match_results<std::string::const_iterator> &match,
|
||||||
std::uint64_t words);
|
std::uint64_t words);
|
||||||
|
|
||||||
|
//! Return the text of the last headline (`<h1>` … `<h6>`) found in `prefix`,
//! or an empty string if there is none.
|
||||||
|
[[nodiscard]] std::string headline(std::string_view prefix);
|
||||||
|
|
||||||
} // namespace epubgrep::search
|
} // namespace epubgrep::search
|
||||||
|
|
||||||
#endif // EPUBGREP_SEARCH_HPP
|
#endif // EPUBGREP_SEARCH_HPP
|
||||||
|
|
Loading…
Reference in New Issue
Block a user