From b8431019b714a5d72da9c6ac9c59321456c3a5b0 Mon Sep 17 00:00:00 2001 From: tastytea Date: Sun, 6 Jun 2021 21:26:09 +0200 Subject: [PATCH] Don't inject page numbers and headline-markers into the text. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The metadata is recorded in position → data pairs. Closes: https://schlomp.space/tastytea/epubgrep/issues/13 --- src/search.cpp | 158 ++++++++++++++++++++++++------------------------- src/search.hpp | 15 ++++- 2 files changed, 91 insertions(+), 82 deletions(-) diff --git a/src/search.cpp b/src/search.cpp index f57e52b..891caf8 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -27,6 +27,8 @@ #include #include +#include +#include #include #include #include @@ -83,18 +85,18 @@ std::vector search(const fs::path &filepath, for (const auto &entry : epub_filepaths) { DEBUGLOG << "Processing " << entry; - auto document{zip::read_file(filepath, entry)}; - if (!opts.raw) + file_in_epub file; { - cleanup_text(document); - document = helpers::unescape_html(document); + const auto document{zip::read_file(filepath, entry)}; + if (!opts.raw) + { + file = cleanup_text(helpers::unescape_html(document)); + } } - string::const_iterator begin{document.begin()}; - string::const_iterator end{document.end()}; + string::const_iterator begin{file.text.begin()}; + string::const_iterator end{file.text.end()}; boost::match_results match_result; - string last_headline; - string last_page; while (boost::regex_search(begin, end, match_result, re, boost::match_default)) @@ -104,18 +106,10 @@ std::vector search(const fs::path &filepath, match.filepath_inside = entry; match.text = match_result[0]; match.context = context(match_result, opts.context); - const auto current_headline{headline(match_result.prefix().str())}; - if (!current_headline.empty()) - { - last_headline = current_headline; - } - match.headline = last_headline; - const auto current_page{page(match_result.prefix().str())}; - if (!current_page.empty()) - { - last_page = current_page; - } - match.page = last_page; + const auto pos = static_cast( + std::distance(begin, match_result[0].begin())); + match.headline = headline(file, pos); + match.page = page(file, pos); matches.emplace_back(match); begin = match_result[0].second; @@ -125,75 +119,84 @@ std::vector search(const fs::path &filepath, return matches; } -void cleanup_text(string &text) +file_in_epub cleanup_text(const std::string_view text) { + string output{text}; static const boost::regex re_header_start{"<[hH][1-6]"}; static const boost::regex re_header_end{"]+pagebreak[^>]+" "(title|aria-label)" "=\"([[:alnum:]]+)\""}; - size_t pos{}; - while ((pos = text.find('<', pos)) != string::npos) + // TODO: Make this more efficient, 3 → 1; + size_t pos{0}; + while ((pos = output.find('\r', pos)) != string::npos) { - auto endpos{text.find('>', pos) + 1}; + output.erase(pos, 1); + } - // Mark headlines. We need them later on. - string replacement; - if (boost::regex_match(text.substr(pos, 3), re_header_start)) + pos = 0; + while ((pos = output.find_first_of("\n\t", pos)) != string::npos) + { + output.replace(pos, 1, " "); + } + + pos = 0; + while ((pos = output.find(" ", pos)) != string::npos) + { + output.replace(pos, 2, " "); + } + + pos = 0; + file_in_epub file; + size_t headline_start{string::npos}; + while ((pos = output.find('<', pos)) != string::npos) + { + auto endpos{output.find('>', pos) + 1}; + + if (boost::regex_match(output.substr(pos, 3), re_header_start)) { - replacement = ""; + headline_start = pos; } - else if (boost::regex_match(text.substr(pos, 4), re_header_end)) + else if (boost::regex_match(output.substr(pos, 4), re_header_end)) { - replacement = ""; + if (headline_start != string::npos) + { + file.headlines.insert( + {headline_start, + output.substr(headline_start, pos - headline_start)}); + headline_start = string::npos; + } } - else if (text.substr(pos, 6) == " match; using it_size_t = string::const_iterator::difference_type; - string::const_iterator begin{text.begin() + string::const_iterator begin{output.begin() + static_cast(pos)}; - string::const_iterator end{text.begin() + string::const_iterator end{output.begin() + static_cast(endpos)}; if (boost::regex_search(begin, end, match, re_pagebreak)) { - replacement = format("", match[2].str()); + file.pages.insert({pos, match[2].str()}); } } - else if (text.substr(pos, 7) == "