Extract page numbers.
This commit is contained in:
parent
bb4a4c719f
commit
c790c4952c
@ -119,7 +119,7 @@ int main(int argc, char *argv[])
|
||||
}
|
||||
if (!match.page.empty())
|
||||
{
|
||||
cout << ", " << match.page;
|
||||
cout << ", page " << match.page;
|
||||
}
|
||||
cout << ": " << match.context.first << match.text
|
||||
<< match.context.second << '\n';
|
||||
|
@ -20,7 +20,6 @@
|
||||
#include "zip.hpp"
|
||||
|
||||
#include <boost/regex.hpp>
|
||||
#include <boost/regex/v4/regex_match.hpp>
|
||||
|
||||
#include <algorithm>
|
||||
#include <string>
|
||||
@ -73,6 +72,7 @@ std::vector<match> search(const fs::path &filepath, std::string_view regex,
|
||||
std::string::const_iterator end{document.end()};
|
||||
boost::match_results<std::string::const_iterator> match_result;
|
||||
std::string last_headline;
|
||||
std::string last_page;
|
||||
|
||||
while (boost::regex_search(begin, end, match_result, re,
|
||||
boost::match_default))
|
||||
@ -87,6 +87,12 @@ std::vector<match> search(const fs::path &filepath, std::string_view regex,
|
||||
last_headline = current_headline;
|
||||
}
|
||||
match.headline = last_headline;
|
||||
const auto current_page{page(match_result.prefix().str())};
|
||||
if (!current_page.empty())
|
||||
{
|
||||
last_page = current_page;
|
||||
}
|
||||
match.page = last_page;
|
||||
|
||||
matches.emplace_back(match);
|
||||
begin = match_result[0].second;
|
||||
@ -111,6 +117,18 @@ void cleanup_text(std::string &text)
|
||||
{
|
||||
replacement = "</H>";
|
||||
}
|
||||
else if (text.substr(pos, 5) == "<span")
|
||||
{
|
||||
auto endpos{text.find('>')};
|
||||
boost::match_results<const char *> match;
|
||||
const boost::regex re_pagebreak{".+pagebreak.+(title|aria-label)"
|
||||
"=\"([[:alnum:]]+)\".*"};
|
||||
if (boost::regex_search(text.substr(pos, endpos).data(), match,
|
||||
re_pagebreak))
|
||||
{
|
||||
replacement = "<PAGE " + match[2] + ">";
|
||||
}
|
||||
}
|
||||
text.replace(pos, text.find('>', pos) + 1 - pos, replacement);
|
||||
pos += replacement.length();
|
||||
}
|
||||
@ -158,7 +176,7 @@ context(const boost::match_results<std::string::const_iterator> &match,
|
||||
return {prefix.substr(pos_before + 2), suffix.substr(0, pos_after - 1)};
|
||||
}
|
||||
|
||||
[[nodiscard]] std::string headline(const std::string_view prefix)
|
||||
std::string headline(const std::string_view prefix)
|
||||
{
|
||||
size_t pos{prefix.length()};
|
||||
while ((pos = prefix.rfind("<H>", pos)) != std::string_view::npos)
|
||||
@ -170,4 +188,16 @@ context(const boost::match_results<std::string::const_iterator> &match,
|
||||
return {};
|
||||
}
|
||||
|
||||
// Return the label of the last complete "<PAGE label>" marker in prefix, or
// an empty string if prefix contains no complete marker.
//
// prefix is the text preceding a match. It can end in the middle of a marker
// (when the match itself lies inside one); such an unterminated marker is
// skipped instead of yielding a truncated label.
std::string page(const std::string_view prefix)
{
    constexpr std::string_view tag_open{"<PAGE "};

    std::size_t search_from{prefix.length()};
    while (true)
    {
        const std::size_t tag_pos{prefix.rfind(tag_open, search_from)};
        if (tag_pos == std::string_view::npos)
        {
            return {};
        }

        const std::size_t label_begin{tag_pos + tag_open.length()};
        const std::size_t label_end{prefix.find('>', label_begin)};
        if (label_end != std::string_view::npos)
        {
            return std::string{
                prefix.substr(label_begin, label_end - label_begin)};
        }

        // Marker is cut off before its closing '>'. Look for an earlier one,
        // unless we are already at the very start of prefix.
        if (tag_pos == 0)
        {
            return {};
        }
        search_from = tag_pos - 1;
    }
}
|
||||
|
||||
} // namespace epubgrep::search
|
||||
|
@ -73,6 +73,9 @@ context(const boost::match_results<std::string::const_iterator> &match,
|
||||
//! Return last headline if possible.
|
||||
[[nodiscard]] std::string headline(std::string_view prefix);
|
||||
|
||||
//! Return current page if possible.
|
||||
[[nodiscard]] std::string page(std::string_view prefix);
|
||||
|
||||
} // namespace epubgrep::search
|
||||
|
||||
#endif // EPUBGREP_SEARCH_HPP
|
||||
|
Loading…
x
Reference in New Issue
Block a user