From c790c4952ca9c9afe560788030ba1b7b9469250e Mon Sep 17 00:00:00 2001 From: tastytea Date: Mon, 24 May 2021 18:56:43 +0200 Subject: [PATCH] Extract page numbers. --- src/main.cpp | 2 +- src/search.cpp | 34 ++++++++++++++++++++++++++++++++-- src/search.hpp | 3 +++ 3 files changed, 36 insertions(+), 3 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 4e252ee..4230330 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -119,7 +119,7 @@ int main(int argc, char *argv[]) } if (!match.page.empty()) { - cout << ", " << match.page; + cout << ", page " << match.page; } cout << ": " << match.context.first << match.text << match.context.second << '\n'; diff --git a/src/search.cpp b/src/search.cpp index 527ff69..b760d9d 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -20,7 +20,6 @@ #include "zip.hpp" #include -#include #include #include @@ -73,6 +72,7 @@ std::vector search(const fs::path &filepath, std::string_view regex, std::string::const_iterator end{document.end()}; boost::match_results match_result; std::string last_headline; + std::string last_page; while (boost::regex_search(begin, end, match_result, re, boost::match_default)) @@ -87,6 +87,12 @@ std::vector search(const fs::path &filepath, std::string_view regex, last_headline = current_headline; } match.headline = last_headline; + const auto current_page{page(match_result.prefix().str())}; + if (!current_page.empty()) + { + last_page = current_page; + } + match.page = last_page; matches.emplace_back(match); begin = match_result[0].second; @@ -111,6 +117,18 @@ void cleanup_text(std::string &text) { replacement = ""; } + else if (text.substr(pos, 5) == "')}; + boost::match_results match; + const boost::regex re_pagebreak{".+pagebreak.+(title|aria-label)" + "=\"([[:alnum:]]+)\".*"}; + if (boost::regex_search(text.substr(pos, endpos).data(), match, + re_pagebreak)) + { + replacement = ""; + } + } text.replace(pos, text.find('>', pos) + 1 - pos, replacement); pos += replacement.length(); } @@ -158,7 +176,7 @@ context(const boost::match_results &match, return {prefix.substr(pos_before + 2), suffix.substr(0, pos_after - 1)}; } -[[nodiscard]] std::string headline(const std::string_view prefix) +std::string headline(const std::string_view prefix) { size_t pos{prefix.length()}; while ((pos = prefix.rfind("", pos)) != std::string_view::npos) @@ -170,4 +188,16 @@ context(const boost::match_results &match, return {}; } +std::string page(const std::string_view prefix) +{ + size_t pos{prefix.length()}; + while ((pos = prefix.rfind("', pos) - pos)}; + } + + return {}; +} + } // namespace epubgrep::search diff --git a/src/search.hpp b/src/search.hpp index 6510bb3..579dd4a 100644 --- a/src/search.hpp +++ b/src/search.hpp @@ -73,6 +73,9 @@ context(const boost::match_results &match, //! Return last headline if possible. [[nodiscard]] std::string headline(std::string_view prefix); +//! Return current page if possible. +[[nodiscard]] std::string page(std::string_view prefix); + } // namespace epubgrep::search #endif // EPUBGREP_SEARCH_HPP