2021-05-24 07:52:36 +02:00
|
|
|
/* This file is part of epubgrep.
|
|
|
|
* Copyright © 2021 tastytea <tastytea@tastytea.de>
|
|
|
|
*
|
|
|
|
* This program is free software: you can redistribute it and/or modify
|
|
|
|
* it under the terms of the GNU Affero General Public License as published by
|
|
|
|
* the Free Software Foundation, version 3.
|
|
|
|
*
|
|
|
|
* This program is distributed in the hope that it will be useful,
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
* GNU Affero General Public License for more details.
|
|
|
|
*
|
|
|
|
* You should have received a copy of the GNU Affero General Public License
|
|
|
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include "search.hpp"
|
|
|
|
|
|
|
|
#include "fs-compat.hpp"
|
2021-05-24 15:35:49 +02:00
|
|
|
#include "zip.hpp"
|
2021-05-24 07:52:36 +02:00
|
|
|
|
2021-05-24 15:35:49 +02:00
|
|
|
#include <boost/regex.hpp>
|
|
|
|
|
|
|
|
#include <algorithm>
|
|
|
|
#include <string>
|
2021-05-24 07:52:36 +02:00
|
|
|
#include <string_view>
|
|
|
|
#include <vector>
|
|
|
|
|
|
|
|
namespace epubgrep::search
|
|
|
|
{
|
|
|
|
|
|
|
|
// Searches every entry of the archive at `filepath` for `regex` and returns
// one `match` per hit, in the order the entries are listed and the hits occur.
//
// filepath: archive handed to zip::list() and zip::read_file().
// regex:    pattern text; it does not have to be null-terminated.
// opts:     selects the regex grammar (opts.regex, opts.grep_like), case
//           folding (opts.ignore_case), whether entries are searched as-is
//           (opts.raw) and how many words of context to keep (opts.context).
//
// Exceptions raised by the regex constructor or by the zip helpers are not
// caught here and propagate to the caller.
std::vector<match> search(const fs::path &filepath, std::string_view regex,
                          const options &opts)
{
    boost::regex::flag_type flags{};

    switch (opts.regex)
    {
    case regex_kind::basic:
    {
        flags = opts.grep_like ? boost::regex::grep : boost::regex::basic;
        break;
    }
    case regex_kind::extended:
    {
        flags = opts.grep_like ? boost::regex::egrep : boost::regex::extended;
        break;
    }
    case regex_kind::perl:
    {
        flags = boost::regex::perl;
        break;
    }
    }

    if (opts.ignore_case)
    {
        flags |= boost::regex::icase;
    }

    // A std::string_view is not guaranteed to be null-terminated, so pass the
    // pattern as an explicit [first, last) range instead of a C string.
    const boost::regex re(regex.data(), regex.data() + regex.size(), flags);

    std::vector<match> matches;
    for (const auto &entry : zip::list(filepath))
    {
        auto document{zip::read_file(filepath, entry)};
        if (!opts.raw)
        {
            cleanup_text(document);
        }

        std::string::const_iterator begin{document.begin()};
        const std::string::const_iterator end{document.end()};
        boost::match_results<std::string::const_iterator> match_result;

        // NOTE: each search starts where the previous hit ended, so the
        // prefix that context() sees never reaches back past that point.
        while (boost::regex_search(begin, end, match_result, re,
                                   boost::match_default))
        {
            match &found{matches.emplace_back()};
            found.filepath = entry;
            found.text = match_result[0];
            found.context = context(match_result, opts.context);

            begin = match_result[0].second;
            if (match_result[0].first == match_result[0].second)
            {
                // Zero-length hit: without stepping forward the next search
                // would find the very same position again, forever.
                if (begin == end)
                {
                    break;
                }
                ++begin;
            }
        }
    }

    return matches;
}
|
|
|
|
|
|
|
|
// Extracts the text surrounding a hit.
//
// match: result of a successful regex search; its prefix() and suffix() are
//        the text before and after the hit.
// words: number of space-separated words to keep on each side, in addition
//        to the (possibly empty) partial word that touches the hit.
//
// Returns {text before, text after}. A side that contains fewer words than
// requested is returned in full. Only ' ' counts as a separator, and every
// single space counts, so consecutive spaces delimit empty words.
match_context
context(const boost::match_results<std::string::const_iterator> &match,
        std::uint64_t words)
{
    const std::string prefix{match.prefix().str()};
    const std::string suffix{match.suffix().str()};

    // Walk backwards over `words` + 1 spaces. The context starts right after
    // the last space crossed, or at the very beginning if we run out.
    std::string::size_type before_start{0};
    {
        std::string::size_type search_from{prefix.length()};
        std::uint64_t remaining{words};
        while (true)
        {
            const std::string::size_type space{prefix.rfind(' ', search_from)};
            if (space == std::string::npos)
            {
                before_start = 0; // Not enough words: keep the whole prefix.
                break;
            }
            if (remaining == 0 || space == 0)
            {
                // Either enough words were collected or nothing is left in
                // front of this space to look at.
                before_start = space + 1;
                break;
            }
            search_from = space - 1;
            --remaining;
        }
    }

    // Walk forwards over `words` + 1 spaces. The context ends right before
    // the last space crossed, or at the very end if we run out.
    std::string::size_type after_length{std::string::npos};
    {
        std::string::size_type search_from{0};
        std::uint64_t remaining{words};
        while (true)
        {
            const std::string::size_type space{suffix.find(' ', search_from)};
            if (space == std::string::npos)
            {
                after_length = std::string::npos; // Keep the whole suffix.
                break;
            }
            if (remaining == 0)
            {
                after_length = space;
                break;
            }
            search_from = space + 1;
            --remaining;
        }
    }

    return {prefix.substr(before_start), suffix.substr(0, after_length)};
}
|
|
|
|
|
2021-05-24 16:01:41 +02:00
|
|
|
// Strips markup from `text` and normalises its whitespace, in place.
//
//  - Everything from a '<' up to and including the next '>' is removed. A '<'
//    without a closing '>' removes the rest of the text.
//  - '\r' is dropped and '\n' is turned into a space.
//  - Runs of spaces are collapsed into a single space.
//
// A '>' outside of a tag is ordinary text and is kept. The work is done in a
// single pass that compacts the string towards its front.
void cleanup_text(std::string &text)
{
    std::string::size_type write{0}; // Next free slot of the compacted text.
    bool in_tag{false};

    for (std::string::size_type read{0}; read < text.size(); ++read)
    {
        char current{text[read]};

        if (in_tag)
        {
            in_tag = (current != '>');
            continue;
        }
        if (current == '<')
        {
            in_tag = true;
            continue;
        }
        if (current == '\r')
        {
            continue;
        }
        if (current == '\n')
        {
            current = ' ';
        }
        // text[write - 1] is the character emitted last; skipping a space
        // that follows another space collapses the whole run.
        if (current == ' ' && write > 0 && text[write - 1] == ' ')
        {
            continue;
        }

        text[write] = current;
        ++write;
    }

    text.resize(write);
}
|
|
|
|
|
2021-05-24 07:52:36 +02:00
|
|
|
} // namespace epubgrep::search
|