Extract page numbers.

This commit is contained in:
tastytea 2021-05-24 18:56:43 +02:00
parent bb4a4c719f
commit c790c4952c
Signed by: tastytea
GPG Key ID: CFC39497F1B26E07
3 changed files with 36 additions and 3 deletions

View File

@ -119,7 +119,7 @@ int main(int argc, char *argv[])
}
if (!match.page.empty())
{
cout << ", " << match.page;
cout << ", page " << match.page;
}
cout << ": " << match.context.first << match.text
<< match.context.second << '\n';

View File

@ -20,7 +20,6 @@
#include "zip.hpp"
#include <boost/regex.hpp>
#include <boost/regex/v4/regex_match.hpp>
#include <algorithm>
#include <string>
@ -73,6 +72,7 @@ std::vector<match> search(const fs::path &filepath, std::string_view regex,
std::string::const_iterator end{document.end()};
boost::match_results<std::string::const_iterator> match_result;
std::string last_headline;
std::string last_page;
while (boost::regex_search(begin, end, match_result, re,
boost::match_default))
@ -87,6 +87,12 @@ std::vector<match> search(const fs::path &filepath, std::string_view regex,
last_headline = current_headline;
}
match.headline = last_headline;
const auto current_page{page(match_result.prefix().str())};
if (!current_page.empty())
{
last_page = current_page;
}
match.page = last_page;
matches.emplace_back(match);
begin = match_result[0].second;
@ -111,6 +117,18 @@ void cleanup_text(std::string &text)
{
replacement = "</H>";
}
else if (text.substr(pos, 5) == "<span")
{
auto endpos{text.find('>')};
boost::match_results<const char *> match;
const boost::regex re_pagebreak{".+pagebreak.+(title|aria-label)"
"=\"([[:alnum:]]+)\".*"};
if (boost::regex_search(text.substr(pos, endpos).data(), match,
re_pagebreak))
{
replacement = "<PAGE " + match[2] + ">";
}
}
text.replace(pos, text.find('>', pos) + 1 - pos, replacement);
pos += replacement.length();
}
@ -158,7 +176,7 @@ context(const boost::match_results<std::string::const_iterator> &match,
return {prefix.substr(pos_before + 2), suffix.substr(0, pos_after - 1)};
}
[[nodiscard]] std::string headline(const std::string_view prefix)
std::string headline(const std::string_view prefix)
{
size_t pos{prefix.length()};
while ((pos = prefix.rfind("<H>", pos)) != std::string_view::npos)
@ -170,4 +188,16 @@ context(const boost::match_results<std::string::const_iterator> &match,
return {};
}
//! Return current page if possible.
//!
//! Searches `prefix` backwards for the most recent complete `<PAGE …>` marker
//! and returns the text between `<PAGE ` and the closing `>`.
//!
//! A marker without a closing `>` (which happens when `prefix` ends in the
//! middle of a marker) is skipped instead of being reported as a truncated
//! page number; the search then continues with the marker before it.
//!
//! @param prefix Text in which to look for page markers.
//! @return The page label of the last complete marker, or an empty string if
//!         `prefix` contains no complete marker.
std::string page(const std::string_view prefix)
{
    constexpr std::string_view marker{"<PAGE "};

    size_t pos{prefix.length()};
    while ((pos = prefix.rfind(marker, pos)) != std::string_view::npos)
    {
        const size_t label_begin{pos + marker.length()};
        const size_t label_end{prefix.find('>', label_begin)};
        if (label_end != std::string_view::npos)
        {
            return std::string{
                prefix.substr(label_begin, label_end - label_begin)};
        }

        // Unterminated marker: ignore it and look further back. A marker at
        // offset 0 has nothing before it, so stop there.
        if (pos == 0)
        {
            break;
        }
        --pos;
    }

    return {};
}
} // namespace epubgrep::search

View File

@ -73,6 +73,9 @@ context(const boost::match_results<std::string::const_iterator> &match,
//! Return last headline if possible.
[[nodiscard]] std::string headline(std::string_view prefix);
//! Return current page if possible.
//!
//! Looks for the last `<PAGE ` marker in `prefix` and returns the text between
//! that marker and the following `>`. Returns an empty string if `prefix`
//! contains no such marker.
//!
//! NOTE(review): presumably `prefix` is text that has already been through
//! cleanup_text(), which is where the `<PAGE …>` markers are produced; confirm
//! against the callers.
[[nodiscard]] std::string page(std::string_view prefix);
} // namespace epubgrep::search
#endif // EPUBGREP_SEARCH_HPP