epubgrep/src/search.cpp

298 lines
8.5 KiB
C++

/* This file is part of epubgrep.
* Copyright © 2021 tastytea <tastytea@tastytea.de>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, version 3.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include "search.hpp"
#include "fs-compat.hpp"
#include "helpers.hpp"
#include "log.hpp"
#include "zip.hpp"
#include <boost/regex.hpp>
#include <fmt/format.h>
#include <fmt/ostream.h> // For compatibility with fmt 4.
#include <algorithm>
#include <array>
#include <string>
#include <string_view>
#include <vector>
namespace epubgrep::search
{
using fmt::format;
using std::string;
// Search one EPUB file for a regular expression.
//
// filepath: the EPUB archive to search.
// regex:    the pattern; need not be NUL-terminated.
// opts:     selects the regex dialect (opts.regex, opts.grep_like), case
//           sensitivity (opts.ignore_case), whether documents are searched
//           without cleanup (opts.raw) and the number of context words
//           (opts.context).
//
// Returns one entry per non-empty match, in the order the documents are
// listed and the matches occur inside each document.
std::vector<match> search(const fs::path &filepath,
                          const std::string_view regex, const settings &opts)
{
    LOG(log::sev::info)
        << format(R"(Starting search in {0:s} using regex "{1:s}")", filepath,
                  regex);

    boost::regex::flag_type flags{};
    switch (opts.regex)
    {
    case options::regex_kind::basic:
    {
        flags = opts.grep_like ? boost::regex::grep : boost::regex::basic;
        break;
    }
    case options::regex_kind::extended:
    {
        flags = opts.grep_like ? boost::regex::egrep : boost::regex::extended;
        break;
    }
    case options::regex_kind::perl:
    {
        flags = boost::regex::perl;
        break;
    }
    }
    if (opts.ignore_case)
    {
        flags |= boost::regex::icase;
    }

    // A string_view is not guaranteed to be NUL-terminated, so hand the
    // pattern over as an explicit [begin, end) range.
    const boost::regex re(regex.data(), regex.data() + regex.size(), flags);

    // Empty matches would never advance the search position below and make
    // the loop spin forever, so they are excluded.
    const auto match_flags{boost::match_default | boost::match_not_null};

    std::vector<match> matches;
    // Only the spine documents are searched, unless raw mode is requested.
    const std::vector<string> epub_filepaths{
        opts.raw ? zip::list(filepath) : zip::list_spine(filepath)};

    for (const auto &entry : epub_filepaths)
    {
        DEBUGLOG << "Processing " << entry;

        auto document{zip::read_file(filepath, entry)};
        if (!opts.raw)
        {
            cleanup_text(document);
            document = helpers::unescape_html(document);
        }

        string::const_iterator begin{document.begin()};
        const string::const_iterator end{document.end()};
        boost::match_results<string::const_iterator> match_result;
        // The prefix of a match only reaches back to the previous match, so
        // the most recent headline and page are carried over between hits.
        string last_headline;
        string last_page;

        while (boost::regex_search(begin, end, match_result, re, match_flags))
        {
            const auto prefix{match_result.prefix().str()};

            auto &found{matches.emplace_back()};
            found.filepath_epub = filepath;
            found.filepath_inside = entry;
            found.text = match_result[0];
            found.context = context(match_result, opts.context);

            if (auto current_headline{headline(prefix)};
                !current_headline.empty())
            {
                last_headline = current_headline;
            }
            found.headline = last_headline;

            if (auto current_page{page(prefix)}; !current_page.empty())
            {
                last_page = current_page;
            }
            found.page = last_page;

            // Continue right behind the current match.
            begin = match_result[0].second;
        }
    }

    return matches;
}
// Strip markup from `text` in place and normalise its whitespace.
//
// Tag handling, for every '<' found:
//   - opening / closing <h1>..<h6> tags become the markers "<H>" / "</H>",
//   - "<span " tags that match the pagebreak pattern become "<PAGE n>",
//   - "<style " and "<script " elements are dropped together with their
//     content when a matching closing tag exists,
//   - every other tag is removed.
// A tag without a closing '>' is treated as running to the end of the text.
//
// Afterwards '\r' is removed, '\n' and '\t' become spaces and runs of spaces
// are collapsed into a single space.
void cleanup_text(string &text)
{
    static const boost::regex re_header_start{"<[hH][1-6]"};
    static const boost::regex re_header_end{"</[hH][1-6]"};
    static const boost::regex re_pagebreak{"[^>]+pagebreak[^>]+"
                                           "(title|aria-label)"
                                           "=\"([[:alnum:]]+)\""};

    size_t pos{};
    while ((pos = text.find('<', pos)) != string::npos)
    {
        // Do not let a missing '>' wrap around to position 0.
        const size_t tag_close{text.find('>', pos)};
        size_t endpos{tag_close == string::npos ? text.size()
                                                : tag_close + 1};

        // Mark headlines and pages. We need them later on.
        string replacement;
        if (boost::regex_match(text.substr(pos, 3), re_header_start))
        {
            replacement = "<H>";
        }
        else if (boost::regex_match(text.substr(pos, 4), re_header_end))
        {
            replacement = "</H>";
        }
        else if (text.compare(pos, 6, "<span ") == 0)
        {
            using it_size_t = string::const_iterator::difference_type;
            const string::const_iterator tag_begin{
                text.cbegin() + static_cast<it_size_t>(pos)};
            const string::const_iterator tag_end{
                text.cbegin() + static_cast<it_size_t>(endpos)};

            boost::match_results<string::const_iterator> pagebreak;
            if (boost::regex_search(tag_begin, tag_end, pagebreak,
                                    re_pagebreak))
            {
                replacement = format("<PAGE {0:s}>", pagebreak[2].str());
            }
        }
        else
        {
            const bool is_style{text.compare(pos, 7, "<style ") == 0};
            const bool is_script{text.compare(pos, 8, "<script ") == 0};
            const bool self_closing{tag_close != string::npos
                                    && text[tag_close - 1] == '/'};

            if ((is_style || is_script) && tag_close != string::npos
                && !self_closing)
            {
                // Extend the removed range up to and including the closing
                // tag, so that CSS / JavaScript does not end up in the text.
                const std::string_view closing_tag{is_style ? "</style"
                                                            : "</script"};
                const size_t closing_pos{text.find(closing_tag, endpos)};
                if (closing_pos != string::npos)
                {
                    const size_t closing_end{text.find('>', closing_pos)};
                    endpos = (closing_end == string::npos) ? text.size()
                                                           : closing_end + 1;
                }
            }
        }

        text.replace(pos, endpos - pos, replacement);
        // Skip the marker we just inserted, it must not be parsed as a tag.
        pos += replacement.length();
    }

    // Remove carriage returns.
    text.erase(std::remove(text.begin(), text.end(), '\r'), text.end());

    // Turn newlines and tabs into spaces.
    std::replace_if(
        text.begin(), text.end(),
        [](const char c) { return c == '\n' || c == '\t'; }, ' ');

    // Collapse runs of spaces into a single space.
    text.erase(std::unique(text.begin(), text.end(),
                           [](const char lhs, const char rhs)
                           { return lhs == ' ' && rhs == ' '; }),
               text.end());
}
// Collect the text surrounding a match.
//
// match: result of a successful regex search; its prefix() and suffix() are
//        the text that is scanned.
// words: how many whitespace-delimited steps to walk away from the match on
//        each side. 0 yields an empty context.
//
// Returns the text before and after the match, with whitespace trimmed from
// the start of the former and the end of the latter.
match_context context(const boost::match_results<string::const_iterator> &match,
                      std::uint64_t words)
{
    if (words == 0)
    {
        return {};
    }

    const auto rbegin_before{std::reverse_iterator(match.prefix().end())};
    const auto rend_before{std::reverse_iterator(match.prefix().begin())};
    const auto begin_after{match.suffix().begin()};
    const auto end_after{match.suffix().end()};

    const std::array<char, 4> whitespace{' ', '\n', '\r', '\t'};
    // Advance to the next whitespace and then past it, never beyond `last`.
    const auto next_word = [&whitespace](auto pos, const auto last)
    {
        pos = std::find_first_of(pos, last, whitespace.begin(),
                                 whitespace.end());
        while (pos != last && helpers::is_whitespace(*pos))
        {
            ++pos;
        }
        return pos;
    };

    auto pos_before{rbegin_before};
    auto pos_after{begin_after};

    // One step more than requested is taken on each side.
    // NOTE(review): presumably because the first step only consumes the
    // remainder of the word the match is embedded in – confirm.
    ++words;
    for (; words != 0; --words)
    {
        if (pos_before == rend_before && pos_after == end_after)
        {
            break; // Both sides are exhausted, nothing left to collect.
        }
        pos_before = next_word(pos_before, rend_before);
        pos_after = next_word(pos_after, end_after);
    }

    // [rbegin_before, pos_before) in reverse order is
    // [pos_before.base(), rbegin_before.base()) in reading order.
    string before(pos_before.base(), rbegin_before.base());
    string after(begin_after, pos_after);

    while (!before.empty() && helpers::is_whitespace(before.front()))
    {
        before.erase(0, 1);
    }
    while (!after.empty() && helpers::is_whitespace(after.back()))
    {
        after.pop_back();
    }

    return {before, after};
}
// Extract the text of the last headline marker in `prefix`.
//
// Looks for the last "<H>" and returns everything between it and the next
// '<' (or the end of `prefix`), with surrounding whitespace removed.
// Returns an empty string if `prefix` contains no "<H>".
string headline(const std::string_view prefix)
{
    const auto marker_pos{prefix.rfind("<H>")};
    if (marker_pos == std::string_view::npos)
    {
        return {};
    }

    const auto text_begin{marker_pos + 3};
    // If there is no '<' behind the marker, the length wraps around and
    // substr() clamps it to the rest of the view.
    string result{
        prefix.substr(text_begin, prefix.find('<', text_begin) - text_begin)};

    // The headline can be empty (e.g. the match starts right after "<H>")
    // or consist of whitespace only, so check before looking at the ends.
    while (!result.empty() && helpers::is_whitespace(result.front()))
    {
        result.erase(0, 1);
    }
    while (!result.empty() && helpers::is_whitespace(result.back()))
    {
        result.pop_back();
    }

    return result;
}
// Extract the page from the last "<PAGE " marker in `prefix`.
//
// Returns the characters between the marker and the next '>' (or the end of
// `prefix`), or an empty string if there is no marker.
string page(const std::string_view prefix)
{
    constexpr std::string_view marker{"<PAGE "};

    const auto marker_pos{prefix.rfind(marker)};
    if (marker_pos == std::string_view::npos)
    {
        return {};
    }

    const auto value_begin{marker_pos + marker.length()};
    const auto value_end{prefix.find('>', value_begin)};
    // Without a closing '>' the length wraps around and substr() clamps it.
    return string{prefix.substr(value_begin, value_end - value_begin)};
}
} // namespace epubgrep::search