/*  This file is part of epubgrep.
 *  Copyright © 2021 tastytea
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Affero General Public License as published by
 *  the Free Software Foundation, version 3.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Affero General Public License for more details.
 *
 *  You should have received a copy of the GNU Affero General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#include "search.hpp"

#include "fs-compat.hpp"
#include "zip.hpp"

#include <boost/regex.hpp>

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <string>
#include <string_view>
#include <utility>
#include <vector>

namespace epubgrep::search
{

using std::string;

// Search every entry returned by zip::list(filepath) for `regex` and return
// one `match` per hit.
//
// opts.regex / opts.grep_like select the Boost.Regex syntax flags,
// opts.ignore_case adds case-insensitivity, opts.raw skips cleanup_text() and
// opts.context is the number of words of context handed to context().
//
// Headline and page are looked up in the text between the previous hit and
// the current one; if nothing is found there, the last value seen in the same
// entry is reused.
std::vector<match> search(const fs::path &filepath, std::string_view regex,
                          const settings &opts)
{
    boost::regex::flag_type flags{};
    switch (opts.regex)
    {
    case options::regex_kind::basic:
    {
        flags = opts.grep_like ? boost::regex::grep : boost::regex::basic;
        break;
    }
    case options::regex_kind::extended:
    {
        flags = opts.grep_like ? boost::regex::egrep : boost::regex::extended;
        break;
    }
    case options::regex_kind::perl:
    {
        flags = boost::regex::perl;
        break;
    }
    }
    if (opts.ignore_case)
    {
        flags |= boost::regex::icase;
    }

    // A string_view is not guaranteed to be null-terminated, so pass the
    // explicit [begin, end) range instead of a bare pointer.
    const boost::regex re(regex.data(), regex.data() + regex.size(), flags);

    // An empty match would leave `begin` where it is and the loop below would
    // never terminate, so empty matches are not accepted.
    const auto match_flags{boost::match_default | boost::match_not_null};

    std::vector<match> matches;
    for (const auto &entry : zip::list(filepath))
    {
        auto document{zip::read_file(filepath, entry)};
        if (!opts.raw)
        {
            cleanup_text(document);
        }

        string::const_iterator begin{document.begin()};
        string::const_iterator end{document.end()};
        boost::match_results<string::const_iterator> match_result;
        string last_headline;
        string last_page;

        while (boost::regex_search(begin, end, match_result, re, match_flags))
        {
            match current;
            current.epub_filepath = filepath;
            current.filepath = entry;
            current.text = match_result[0];
            current.context = context(match_result, opts.context);

            // Text between the end of the previous hit (or the start of the
            // document) and this hit.
            const string prefix{match_result.prefix().str()};

            auto current_headline{headline(prefix)};
            if (!current_headline.empty())
            {
                last_headline = std::move(current_headline);
            }
            current.headline = last_headline;

            auto current_page{page(prefix)};
            if (!current_page.empty())
            {
                last_page = std::move(current_page);
            }
            current.page = last_page;

            matches.push_back(std::move(current));
            begin = match_result[0].second;
        }
    }

    return matches;
}

// Strip markup from `text` in place and normalise its whitespace.
//
// Every "<…>" tag is removed, except that
//   - a tag starting with "<h1" … "<h6" (either case) becomes "<H>",
//   - a tag starting with "</h" (either case) becomes "</H>",
//   - a "<span" tag matching the page-break pattern becomes "<PAGE x>", where
//     x is the value of its title / aria-label attribute.
// Afterwards carriage returns are dropped, newlines become spaces and runs of
// spaces are collapsed into a single space.
void cleanup_text(string &text)
{
    // Compiled once; the pattern never changes.
    static const boost::regex re_pagebreak{".+pagebreak.+(title|aria-label)"
                                           "=\"([[:alnum:]]+)\".*"};

    size_t pos{};
    while ((pos = text.find('<', pos)) != string::npos)
    {
        // The tag spans from '<' up to and including the next '>'. If there
        // is no '>', the rest of the text is treated as the tag.
        const auto tag_end{text.find('>', pos)};
        const size_t tag_length{tag_end == string::npos ? string::npos
                                                        : tag_end - pos + 1};

        // Mark headlines. We need them later on.
        string replacement;
        {
            // View into `text`; only valid until text.replace() below.
            const std::string_view tag{
                std::string_view{text}.substr(pos, tag_length)};
            const auto is_h{[](const char c) { return c == 'h' || c == 'H'; }};

            if (tag.size() >= 3 && is_h(tag[1]) && tag[2] >= '1'
                && tag[2] <= '6')
            {
                replacement = "<H>";
            }
            // NOTE(review): only the first 3 characters are checked, so other
            // closing tags that begin with "</h" are marked as well.
            else if (tag.size() >= 3 && tag[1] == '/' && is_h(tag[2]))
            {
                replacement = "</H>";
            }
            else if (tag.substr(0, 5) == "<span")
            {
                boost::cmatch match;
                if (boost::regex_search(tag.data(), tag.data() + tag.size(),
                                        match, re_pagebreak))
                {
                    replacement = "<PAGE " + match[2].str() + ">";
                }
            }
        }

        text.replace(pos, tag_length, replacement);
        pos += replacement.length();
    }

    // Drop carriage returns.
    text.erase(std::remove(text.begin(), text.end(), '\r'), text.end());
    // Turn newlines into spaces.
    std::replace(text.begin(), text.end(), '\n', ' ');
    // Collapse runs of spaces into one space.
    text.erase(std::unique(text.begin(), text.end(),
                           [](const char lhs, const char rhs)
                           { return lhs == ' ' && rhs == ' '; }),
               text.end());
}

// Return the text surrounding `match`, limited by counting spaces.
//
// The text before the match is cut after the (words + 1)-th space counted
// backwards from the match, the text after the match is cut before the
// (words + 1)-th space counted forwards. If a side has fewer spaces, all of
// it is returned. With words == 0 an empty context is returned.
match_context context(const boost::match_results<string::const_iterator> &match,
                      std::uint64_t words)
{
    if (words == 0)
    {
        return {};
    }

    const string prefix{match.prefix().str()};
    const string suffix{match.suffix().str()};

    // Index of the character in front of the last space found, or 0 once the
    // start of the prefix has been reached.
    size_t pos_before{prefix.length()};
    // Set when the search stopped at a space at index 0. Without this,
    // decrementing the index would wrap around and restart from the end.
    bool leading_space{false};
    // Index behind the last space found, or npos when the suffix has no more
    // spaces.
    size_t pos_after{};

    // Runs at most words + 1 times without having to compute words + 1, and
    // stops early once both sides are exhausted.
    std::uint64_t remaining{words};
    while (true)
    {
        if (pos_before != 0)
        {
            const auto space{prefix.rfind(' ', pos_before)};
            if (space == string::npos)
            {
                pos_before = 0;
            }
            else if (space == 0)
            {
                leading_space = true;
                pos_before = 0;
            }
            else
            {
                pos_before = space - 1;
            }
        }

        if (pos_after != string::npos)
        {
            pos_after = suffix.find(' ', pos_after);
            if (pos_after != string::npos)
            {
                ++pos_after;
            }
        }

        if (remaining == 0
            || (pos_before == 0 && pos_after == string::npos))
        {
            break;
        }
        --remaining;
    }

    size_t start{0};
    if (leading_space)
    {
        start = 1; // Skip the space at index 0.
    }
    else if (pos_before != 0)
    {
        start = pos_before + 2; // First character behind the space.
    }

    if (pos_after != string::npos)
    {
        pos_after -= 1; // Exclude the space itself.
    }

    return {prefix.substr(start), suffix.substr(0, pos_after)};
}

// Return the text following the last "<H>" marker in `prefix`, up to the next
// '<' (or the end of `prefix`). Returns an empty string if there is no marker.
string headline(const std::string_view prefix)
{
    constexpr std::string_view marker{"<H>"};

    const auto marker_pos{prefix.rfind(marker)};
    if (marker_pos == std::string_view::npos)
    {
        return {};
    }

    const auto start{marker_pos + marker.length()};
    return string{prefix.substr(start, prefix.find('<', start) - start)};
}

// Return the text following the last "<PAGE " marker in `prefix`, up to the
// next '>' (or the end of `prefix`). Returns an empty string if there is no
// marker.
string page(const std::string_view prefix)
{
    constexpr std::string_view marker{"<PAGE "};

    const auto marker_pos{prefix.rfind(marker)};
    if (marker_pos == std::string_view::npos)
    {
        return {};
    }

    const auto start{marker_pos + marker.length()};
    return string{prefix.substr(start, prefix.find('>', start) - start)};
}

} // namespace epubgrep::search