/* This file is part of epubgrep.
 * Copyright © 2021 tastytea
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, version 3.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 */

#include "search.hpp"

#include "fs-compat.hpp"
#include "helpers.hpp"
#include "log.hpp"
#include "zip.hpp"

// NOTE(review): the angle-bracket header names below were reconstructed from
// the identifiers used in this file — confirm against the build.
#include <boost/regex.hpp>
#include <fmt/format.h>
#include <fmt/ostream.h> // For compatibility with fmt 4.

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <iterator>
#include <string>
#include <string_view>
#include <vector>

namespace epubgrep::search
{

using fmt::format;
using std::string;

/// @brief Find every occurrence of a regular expression inside one EPUB file.
///
/// The regex dialect is selected by `opts.regex` (with the grep/egrep variants
/// when `opts.grep_like` is set) and made case-insensitive when
/// `opts.ignore_case` is set. Unless `opts.raw` is set, only the entries
/// returned by `zip::list_spine()` are searched and each document is passed
/// through `helpers::unescape_html()` and `cleanup_text()` first; in raw mode
/// every entry returned by `zip::list()` is searched verbatim.
///
/// @param filepath Path of the EPUB file on disk.
/// @param regex    The pattern to search for. It does not need to be
///                 NUL-terminated.
/// @param opts     Search settings (dialect, case sensitivity, raw mode,
///                 context size).
///
/// @return One entry per non-empty match, in the order the files are listed
///         and the matches are found within each file.
///
/// @note Compiling the pattern may throw if it is not valid in the selected
///       dialect (presumably `boost::regex_error` — confirm against callers).
std::vector<match> search(const fs::path &filepath,
                          const std::string_view regex, const settings &opts)
{
    LOG(log::sev::info)
        << format(R"(Starting search in {0:s} using regex "{1:s}")", filepath,
                  regex);

    boost::regex::flag_type flags{};
    switch (opts.regex)
    {
    case options::regex_kind::basic:
    {
        flags = opts.grep_like ? boost::regex::grep : boost::regex::basic;
        break;
    }
    case options::regex_kind::extended:
    {
        flags = opts.grep_like ? boost::regex::egrep : boost::regex::extended;
        break;
    }
    case options::regex_kind::perl:
    {
        flags = boost::regex::perl;
        break;
    }
    }
    if (opts.ignore_case)
    {
        flags |= boost::regex::icase;
    }

    // A string_view is not guaranteed to be NUL-terminated, so hand the
    // pattern over as an explicit [first, last) range instead of a C string.
    const boost::regex re(regex.data(), regex.data() + regex.size(), flags);

    std::vector<match> matches;
    const auto epub_filepaths{opts.raw ? zip::list(filepath)
                                       : zip::list_spine(filepath)};

    for (const auto &entry : epub_filepaths)
    {
        DEBUGLOG << "Processing " << entry;

        file_in_epub file;
        {
            // Scoped so the unprocessed document is released before matching.
            const auto document{zip::read_file(filepath, entry)};
            if (!opts.raw)
            {
                file = cleanup_text(helpers::unescape_html(document));
            }
            else
            {
                file.text = document;
            }
        }

        const string::const_iterator text_begin{file.text.begin()};
        const string::const_iterator text_end{file.text.end()};
        string::const_iterator cursor{text_begin};

        // match_not_null: a pattern that can match the empty string would
        // otherwise leave the cursor where it is and loop forever.
        boost::match_flag_type search_flags{boost::match_default
                                            | boost::match_not_null};
        boost::match_results<string::const_iterator> match_result;

        while (boost::regex_search(cursor, text_end, match_result, re,
                                   search_flags))
        {
            // Build the result in place instead of copying a temporary.
            auto &found{matches.emplace_back()};
            found.filepath_epub = filepath;
            found.filepath_inside = entry;
            found.text = match_result[0];
            found.context = context(match_result, opts.context);

            // Offset of the match from the start of the text.
            const auto pos{static_cast<std::size_t>(
                std::distance(text_begin, match_result[0].begin()))};
            found.headline = headline(file, pos);
            found.page = page(file, pos);

            cursor = match_result[0].end();
            // From the second search on, the cursor is no longer the start of
            // the text: let anchors, word boundaries and look-behind see the
            // preceding character instead of treating it as a fresh buffer.
            search_flags |= boost::match_prev_avail | boost::match_not_bob;
        }
    }

    return matches;
}

// NOTE(review): this definition continues past the reviewed span and its
// tokens are left as found. The re_header_end literal looks truncated, and
// re_pagebreak is used further down without a visible declaration — confirm
// against the upstream source.
file_in_epub cleanup_text(const std::string_view text)
{
    string output{text};
    static const boost::regex re_header_start{"<[hH][1-6]"};
    static const boost::regex re_header_end{"]+pagebreak[^>]+"
                                            "(title|aria-label)"
                                            "=\"([[:alnum:]]+)\""};
    // TODO: Make this more efficient, 3 → 1;
    size_t pos{0};
    while ((pos = output.find('\r', pos)) != string::npos)
    {
        output.erase(pos, 1);
    }
    pos = 0;
    while ((pos = output.find_first_of("\n\t", pos)) != string::npos)
    {
        output.replace(pos, 1, " ");
    }
    pos = 0;
    while ((pos = output.find(" ", pos)) != string::npos)
    {
        output.replace(pos, 2, " ");
    }
    pos = 0;

    file_in_epub file;
    size_t headline_start{string::npos};
    while ((pos = output.find('<', pos)) !=
string::npos) { auto endpos{output.find('>', pos) + 1}; if (boost::regex_match(output.substr(pos, 3), re_header_start)) { headline_start = pos; } else if (boost::regex_match(output.substr(pos, 4), re_header_end)) { if (headline_start != string::npos) { file.headlines.insert( {headline_start, output.substr(headline_start, pos - headline_start)}); headline_start = string::npos; } } else if (output.substr(pos, 6) == " match; using it_size_t = string::const_iterator::difference_type; string::const_iterator begin{output.begin() + static_cast(pos)}; string::const_iterator end{output.begin() + static_cast(endpos)}; if (boost::regex_search(begin, end, match, re_pagebreak)) { file.pages.insert({pos, match[2].str()}); } } else if (output.substr(pos, 7) == "