2021-05-24 07:52:36 +02:00
|
|
|
/* This file is part of epubgrep.
|
|
|
|
* Copyright © 2021 tastytea <tastytea@tastytea.de>
|
|
|
|
*
|
|
|
|
* This program is free software: you can redistribute it and/or modify
|
|
|
|
* it under the terms of the GNU Affero General Public License as published by
|
|
|
|
* the Free Software Foundation, version 3.
|
|
|
|
*
|
|
|
|
* This program is distributed in the hope that it will be useful,
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
* GNU Affero General Public License for more details.
|
|
|
|
*
|
|
|
|
* You should have received a copy of the GNU Affero General Public License
|
|
|
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include "search.hpp"
|
|
|
|
|
|
|
|
#include "fs-compat.hpp"
|
2021-05-30 21:52:52 +02:00
|
|
|
#include "helpers.hpp"
|
2021-06-01 15:32:10 +02:00
|
|
|
#include "log.hpp"
|
2021-05-24 15:35:49 +02:00
|
|
|
#include "zip.hpp"
|
2021-05-24 07:52:36 +02:00
|
|
|
|
2021-05-24 15:35:49 +02:00
|
|
|
#include <boost/regex.hpp>
|
2021-06-01 15:32:10 +02:00
|
|
|
#include <fmt/format.h>
|
|
|
|
#include <fmt/ostream.h> // For compatibility with fmt 4.
|
2021-05-24 15:35:49 +02:00
|
|
|
|
|
|
|
#include <algorithm>
|
2021-05-30 13:31:59 +02:00
|
|
|
#include <array>
|
2021-06-06 21:26:09 +02:00
|
|
|
#include <iterator>
|
|
|
|
#include <memory>
|
2021-05-24 15:35:49 +02:00
|
|
|
#include <string>
|
2021-05-24 07:52:36 +02:00
|
|
|
#include <string_view>
|
|
|
|
#include <vector>
|
|
|
|
|
|
|
|
namespace epubgrep::search
|
|
|
|
{
|
|
|
|
|
2021-06-01 15:32:10 +02:00
|
|
|
using fmt::format;
|
2021-05-26 18:02:27 +02:00
|
|
|
using std::string;
|
|
|
|
|
2021-05-28 19:07:27 +02:00
|
|
|
// Search a single EPUB file for every match of a regular expression.
//
// filepath: Path of the EPUB file to search.
// regex:    The pattern. It does not need to be null-terminated.
// opts:     Selects the regex dialect (opts.regex, opts.grep_like), case
//           sensitivity (opts.ignore_case), whether documents are searched
//           as-is (opts.raw) and the number of context words (opts.context).
//
// Returns one entry per match, in the order the documents are listed by
// zip::list_spine() (or zip::list() when opts.raw is set).
std::vector<match> search(const fs::path &filepath,
                          const std::string_view regex, const settings &opts)
{
    LOG(log::sev::info)
        << format(R"(Starting search in {0:s} using regex "{1:s}")", filepath,
                  regex);

    boost::regex::flag_type flags{};

    switch (opts.regex)
    {
    case options::regex_kind::basic:
    {
        flags = opts.grep_like ? boost::regex::grep : boost::regex::basic;
        break;
    }
    case options::regex_kind::extended:
    {
        flags = opts.grep_like ? boost::regex::egrep : boost::regex::extended;
        break;
    }
    case options::regex_kind::perl:
    {
        flags = boost::regex::perl;
        break;
    }
    }

    if (opts.ignore_case)
    {
        flags |= boost::regex::icase;
    }

    // A string_view is not guaranteed to be null-terminated, so pass the exact
    // character range instead of a bare pointer.
    const boost::regex re(regex.data(), regex.data() + regex.size(), flags);

    std::vector<match> matches;
    const std::vector<string> epub_filepaths{[&opts, &filepath]
                                             {
                                                 if (!opts.raw)
                                                 {
                                                     return zip::list_spine(
                                                         filepath);
                                                 }
                                                 return zip::list(filepath);
                                             }()};

    for (const auto &entry : epub_filepaths)
    {
        DEBUGLOG << "Processing " << entry;

        file_in_epub file;
        {
            const auto document{zip::read_file(filepath, entry)};
            if (!opts.raw)
            {
                file = cleanup_text(helpers::unescape_html(document));
            }
            else
            {
                file.text = document;
            }
        }

        string::const_iterator begin{file.text.begin()};
        const string::const_iterator end{file.text.end()};
        // Fixed start of the text; used to turn iterators into offsets.
        const auto begin_text{begin};
        boost::match_results<string::const_iterator> match_result;

        while (boost::regex_search(begin, end, match_result, re,
                                   boost::match_default))
        {
            match result;
            result.filepath_epub = filepath;
            result.filepath_inside = entry;
            result.text = match_result[0];
            result.context = context(match_result, opts.context);

            // Offset of the match inside the text, used to look up the
            // headline and page it belongs to.
            const auto pos = static_cast<size_t>(
                std::distance(begin_text, match_result[0].begin()));
            result.headline = headline(file, pos);
            result.page = page(file, pos);

            matches.emplace_back(result);

            begin = match_result[0].end();
            if (match_result[0].length() == 0)
            {
                // An empty match consumes no input. Step over one character
                // so that the search always makes progress and terminates.
                if (begin == end)
                {
                    break;
                }
                ++begin;
            }
        }
    }

    return matches;
}
|
|
|
|
|
2021-06-06 21:26:09 +02:00
|
|
|
// Turn an (X)HTML document into plain text and record where headlines and
// page breaks are.
//
// - '\r' is removed, '\n' and '\t' become spaces, runs of spaces are collapsed
//   to a single space.
// - Every tag is removed. <style ...> and <script ...> are removed up to and
//   including the next tag (NOTE(review): presumably their closing tag; a '>'
//   inside the element's content ends the removal early).
// - The text between <h1>…<h6> and the matching closing tag is stored in
//   file.headlines, keyed by its offset in the cleaned text.
// - For <span ...> tags that match the pagebreak pattern, the value of the
//   title / aria-label attribute is stored in file.pages, keyed by the offset
//   of the tag in the cleaned text.
//
// Returns the cleaned text together with the collected headlines and pages.
file_in_epub cleanup_text(const std::string_view text)
{
    string output{text};
    static const boost::regex re_header_start{"<[hH][1-6]"};
    static const boost::regex re_header_end{"</[hH][1-6]"};
    static const boost::regex re_pagebreak{"[^>]+pagebreak[^>]+"
                                           "(title|aria-label)"
                                           "=\"([[:alnum:]]+)\""};

    {
        size_t pos{0};
        while ((pos = output.find_first_of("\n\t\r", pos)) != string::npos)
        {
            if (output[pos] == '\r')
            {
                output.erase(pos, 1);
            }
            else
            {
                output.replace(pos, 1, " ");
            }
        }
    }

    // Collapse every run of consecutive spaces into a single space.
    output.erase(std::unique(output.begin(), output.end(),
                             [](const char lhs, const char rhs)
                             {
                                 return lhs == ' ' && rhs == ' ';
                             }),
                 output.end());

    size_t pos{0};
    file_in_epub file;
    size_t headline_start{string::npos};
    while ((pos = output.find('<', pos)) != string::npos)
    {
        // One past the '>' that closes this tag. An unterminated tag extends
        // to the end of the text instead of wrapping around to 0.
        const auto tag_close{output.find('>', pos)};
        auto endpos{tag_close == string::npos ? output.size()
                                              : tag_close + 1};

        if (boost::regex_match(output.substr(pos, 3), re_header_start))
        {
            // The tag itself is erased below, so the headline text will start
            // at this very offset.
            headline_start = pos;
        }
        else if (boost::regex_match(output.substr(pos, 4), re_header_end))
        {
            if (headline_start != string::npos)
            {
                file.headlines.insert(
                    {headline_start,
                     output.substr(headline_start, pos - headline_start)});
                headline_start = string::npos;
            }
        }
        else if (output.substr(pos, 6) == "<span ")
        {
            boost::match_results<string::const_iterator> match;
            using it_size_t = string::const_iterator::difference_type;
            const string::const_iterator begin{
                output.cbegin() + static_cast<it_size_t>(pos)};
            const string::const_iterator end{
                output.cbegin() + static_cast<it_size_t>(endpos)};

            if (boost::regex_search(begin, end, match, re_pagebreak))
            {
                file.pages.insert({pos, match[2].str()});
            }
        }
        else if (output.substr(pos, 7) == "<style "
                 || output.substr(pos, 8) == "<script ")
        {
            // Not self-closing: also remove everything up to the end of the
            // next tag.
            if (output.find("/>", pos) > endpos)
            {
                const auto next_close{output.find('>', endpos)};
                endpos = (next_close == string::npos) ? output.size()
                                                      : next_close + 1;
            }
        }

        output.erase(pos, endpos - pos);
    }

    file.text = output;

    return file;
}
|
|
|
|
|
2021-05-26 18:02:27 +02:00
|
|
|
// Collect up to `words` words of context on each side of a match.
//
// match: Result of a regex search; its prefix() and suffix() are the text
//        the context is taken from.
// words: Number of words wanted on each side. 0 yields an empty context.
//
// Returns the text before and after the match, with leading whitespace
// stripped from the former and trailing whitespace stripped from the latter.
match_context context(const boost::match_results<string::const_iterator> &match,
                      std::uint64_t words)
{
    if (words == 0)
    {
        return {};
    }

    // One extra step, taken by the loop below on both sides of the match.
    // NOTE(review): presumably this accounts for a partial word or the
    // whitespace directly adjacent to the match — confirm.
    ++words;

    // The prefix is walked backwards, away from the match.
    const auto rbegin_before{std::reverse_iterator(match.prefix().end())};
    const auto rend_before{std::reverse_iterator(match.prefix().begin())};

    const auto begin_after{match.suffix().begin()};
    const auto end_after{match.suffix().end()};

    auto pos_before{rbegin_before};
    auto pos_after{begin_after};

    const std::array<char, 4> whitespace{' ', '\n', '\r', '\t'};

    while (words != 0)
    {
        if (pos_before != rend_before)
        {
            // Skip to the next whitespace, then over the whole run of it.
            pos_before = std::find_first_of(pos_before, rend_before,
                                            whitespace.begin(),
                                            whitespace.end());
            while (pos_before != rend_before
                   && helpers::is_whitespace(*pos_before))
            {
                ++pos_before;
            }
        }

        if (pos_after != end_after)
        {
            pos_after = std::find_first_of(pos_after, end_after,
                                           whitespace.begin(),
                                           whitespace.end());
            while (pos_after != end_after && helpers::is_whitespace(*pos_after))
            {
                ++pos_after;
            }
        }

        --words;
    }

    // The text before the match was collected back to front; flip it.
    const string before_reversed(rbegin_before, pos_before);
    string before(before_reversed.rbegin(), before_reversed.rend());
    string after(begin_after, pos_after);

    // Either side is empty when the match touches the start or the end of the
    // text, so check before looking at the first / last character.
    while (!before.empty() && helpers::is_whitespace(before.front()))
    {
        before.erase(0, 1);
    }
    while (!after.empty() && helpers::is_whitespace(after.back()))
    {
        after.pop_back();
    }

    return {before, after};
}
|
|
|
|
|
2021-06-06 21:26:09 +02:00
|
|
|
std::string headline(const file_in_epub &file, const size_t pos)
|
2021-05-24 17:18:10 +02:00
|
|
|
{
|
2021-06-06 21:26:09 +02:00
|
|
|
std::string_view last;
|
2021-05-30 21:16:24 +02:00
|
|
|
|
2021-06-06 21:26:09 +02:00
|
|
|
for (const auto &pair : file.headlines)
|
|
|
|
{
|
|
|
|
if (pair.first > pos)
|
2021-05-30 21:16:24 +02:00
|
|
|
{
|
2021-06-06 21:26:09 +02:00
|
|
|
break;
|
2021-05-30 21:16:24 +02:00
|
|
|
}
|
2021-06-06 21:26:09 +02:00
|
|
|
last = pair.second;
|
2021-05-24 17:18:10 +02:00
|
|
|
}
|
|
|
|
|
2021-06-06 21:34:48 +02:00
|
|
|
return string(last);
|
2021-05-24 17:18:10 +02:00
|
|
|
}
|
|
|
|
|
2021-06-06 21:26:09 +02:00
|
|
|
// Return the page label recorded at the largest offset that is not greater
// than pos, or an empty string if no page is recorded at or before pos.
// Iteration stops at the first entry whose offset is greater than pos.
string page(const file_in_epub &file, const size_t pos)
{
    std::string_view label;

    for (auto it{file.pages.begin()};
         it != file.pages.end() && it->first <= pos; ++it)
    {
        label = it->second;
    }

    return string(label);
}
|
|
|
|
|
2021-05-24 07:52:36 +02:00
|
|
|
} // namespace epubgrep::search
|