// epubgrep/src/search.cpp

/*  This file is part of epubgrep.
 *  Copyright © 2021 tastytea <tastytea@tastytea.de>
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Affero General Public License as published by
 *  the Free Software Foundation, version 3.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Affero General Public License for more details.
 *
 *  You should have received a copy of the GNU Affero General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#include "search.hpp"

#include "fs-compat.hpp"
#include "helpers.hpp"
#include "log.hpp"
#include "zip.hpp"

#include <boost/regex.hpp>
#include <fmt/format.h>
#include <fmt/ostream.h> // For compatibility with fmt 4.

#include <algorithm>
#include <array>
#include <iterator>
#include <memory>
#include <string>
#include <string_view>
#include <vector>

namespace epubgrep::search
{

using fmt::format;
using std::string;

// Searches one EPUB file for all occurrences of `regex`.
//
// filepath: Path of the EPUB (zip) file to search.
// regex:    The regular expression; it does not need to be NUL-terminated.
// opts:     Selects the regex dialect (opts.regex, opts.grep_like), case
//           sensitivity (opts.ignore_case), whether the files are searched
//           unprocessed (opts.raw) and how many words of context are
//           collected around each match (opts.context).
//
// Returns one entry per match, in the order the files are listed and the
// matches occur within them. Errors from the regex compiler or the zip
// helpers are not handled here.
std::vector<match> search(const fs::path &filepath,
                          const std::string_view regex, const settings &opts)
{
    LOG(log::sev::info)
        << format(R"(Starting search in {0:s} using regex "{1:s}")", filepath,
                  regex);
    boost::regex::flag_type flags{};

    switch (opts.regex)
    {
    case options::regex_kind::basic:
    {
        flags = opts.grep_like ? boost::regex::grep : boost::regex::basic;
        break;
    }
    case options::regex_kind::extended:
    {
        flags = opts.grep_like ? boost::regex::egrep : boost::regex::extended;
        break;
    }
    case options::regex_kind::perl:
    {
        flags = boost::regex::perl;
        break;
    }
    }

    if (opts.ignore_case)
    {
        flags |= boost::regex::icase;
    }

    // A string_view is not guaranteed to be NUL-terminated, so hand over the
    // exact character range instead of a C string.
    const boost::regex re(regex.data(), regex.data() + regex.size(), flags);
    std::vector<match> matches;
    // Raw mode looks at every file in the archive, otherwise only at the
    // files returned by zip::list_spine().
    const std::vector<string> epub_filepaths{
        opts.raw ? zip::list(filepath) : zip::list_spine(filepath)};

    for (const auto &entry : epub_filepaths)
    {
        DEBUGLOG << "Processing " << entry;
        file_in_epub file;
        if (!opts.raw)
        {
            // NOTE(review): entities are unescaped before the tags are
            // stripped, so an escaped "&lt;" presumably reaches
            // cleanup_text() as a literal '<' – confirm that this is intended.
            file = cleanup_text(
                helpers::unescape_html(zip::read_file(filepath, entry)));
        }
        else
        {
            // Assign the temporary directly to avoid copying the document.
            file.text = zip::read_file(filepath, entry);
        }

        const string::const_iterator begin_text{file.text.begin()};
        const string::const_iterator end{file.text.end()};
        auto begin{begin_text};
        boost::match_results<string::const_iterator> match_result;

        while (boost::regex_search(begin, end, match_result, re,
                                   boost::match_default))
        {
            // Construct in place; this also avoids a local variable that
            // shadows the type name.
            auto &found{matches.emplace_back()};
            found.filepath_epub = filepath;
            found.filepath_inside = entry;
            found.text = match_result[0];
            found.context = context(match_result, opts.context);
            const auto pos = static_cast<size_t>(
                std::distance(begin_text, match_result[0].begin()));
            found.headline = headline(file, pos);
            found.page = page(file, pos);

            begin = match_result[0].end();
            if (match_result[0].begin() == match_result[0].end())
            {
                // A zero-length match does not advance the search position;
                // step forward manually or the loop would never terminate.
                if (begin == end)
                {
                    break;
                }
                ++begin;
            }
        }
    }

    return matches;
}

// Turns (X)HTML into plain text and records where headlines and page breaks
// are.
//
// - Removes '\r', replaces '\n' and '\t' with a space and collapses runs of
//   spaces. This happens before the tags are removed, so removing a tag can
//   still leave two adjacent spaces behind.
// - Removes everything from '<' up to the next '>'. For <style> and <script>
//   elements the contents up to the matching closing tag are removed, too.
// - Stores the text between <h1>–<h6> and the matching closing tag in
//   `headlines`, keyed by the position of the headline in the cleaned text.
// - Stores the title / aria-label of <span> tags that contain "pagebreak" in
//   `pages`, keyed by their position in the cleaned text.
//
// An unterminated tag (no '>' until the end of the text) is treated as if it
// reached to the end of the text.
file_in_epub cleanup_text(const std::string_view text)
{
    static const boost::regex re_header_start{"<[hH][1-6]"};
    static const boost::regex re_header_end{"</[hH][1-6]"};
    static const boost::regex re_pagebreak{"[^>]+pagebreak[^>]+"
                                           "(title|aria-label)"
                                           "=\"([[:alnum:]]+)\""};

    // Whitespace normalisation: three linear passes.
    // TODO: These could be fused into a single pass.
    string normalized{text};
    normalized.erase(std::remove(normalized.begin(), normalized.end(), '\r'),
                     normalized.end());
    std::replace_if(
        normalized.begin(), normalized.end(),
        [](const char c) { return c == '\n' || c == '\t'; }, ' ');
    normalized.erase(std::unique(normalized.begin(), normalized.end(),
                                 [](const char lhs, const char rhs)
                                 { return lhs == ' ' && rhs == ' '; }),
                     normalized.end());

    // True if `tag` opens the element `name`, i. e. starts with "<name " or
    // "<name>".
    const auto opens_element =
        [](const std::string_view tag, const std::string_view name)
    {
        if (tag.size() < name.size() + 2
            || tag.compare(1, name.size(), name) != 0)
        {
            return false;
        }
        const char delimiter{tag[name.size() + 1]};
        return delimiter == ' ' || delimiter == '>';
    };

    // Copy the text between the tags into file.text instead of erasing each
    // tag from the middle of the string. file.text.size() is therefore always
    // the position in the cleaned text at which the current tag was found.
    const std::string_view source{normalized};
    constexpr auto npos{std::string_view::npos};
    file_in_epub file;
    file.text.reserve(source.size());
    size_t headline_start{string::npos};
    size_t cursor{0};

    while (cursor < source.size())
    {
        const size_t tag_start{source.find('<', cursor)};
        if (tag_start == npos)
        {
            file.text.append(source.substr(cursor));
            break;
        }
        file.text.append(source.substr(cursor, tag_start - cursor));

        const size_t tag_close{source.find('>', tag_start)};
        // One past the last character that will be dropped.
        size_t tag_end{tag_close == npos ? source.size() : tag_close + 1};
        const std::string_view tag{
            source.substr(tag_start, tag_end - tag_start)};
        const size_t pos{file.text.size()};

        const std::string_view head3{tag.substr(0, 3)};
        const std::string_view head4{tag.substr(0, 4)};

        if (boost::regex_match(head3.begin(), head3.end(), re_header_start))
        {
            headline_start = pos;
        }
        else if (boost::regex_match(head4.begin(), head4.end(), re_header_end))
        {
            if (headline_start != string::npos)
            {
                // Tags inside the headline have already been dropped, the
                // headline is everything copied since it was opened.
                file.headlines.insert(
                    {headline_start, file.text.substr(headline_start)});
                headline_start = string::npos;
            }
        }
        else if (tag.substr(0, 6) == "<span ")
        {
            boost::match_results<std::string_view::const_iterator> found;
            if (boost::regex_search(tag.begin(), tag.end(), found,
                                    re_pagebreak))
            {
                file.pages.insert({pos, found[2].str()});
            }
        }
        else
        {
            std::string_view closing_tag;
            if (opens_element(tag, "style"))
            {
                closing_tag = "</style";
            }
            else if (opens_element(tag, "script"))
            {
                closing_tag = "</script";
            }

            const bool self_closing{tag.size() >= 2
                                    && tag[tag.size() - 2] == '/'};
            if (!closing_tag.empty() && tag_close != npos && !self_closing)
            {
                // Drop the contents of the element as well. Stylesheets and
                // scripts may contain '>', so look for the closing tag first
                // and only fall back to the next '>' if there is none.
                const size_t closing_start{source.find(closing_tag, tag_end)};
                const size_t closing_end{source.find(
                    '>', closing_start == npos ? tag_end : closing_start)};
                tag_end = (closing_end == npos) ? source.size()
                                                : closing_end + 1;
            }
        }

        cursor = tag_end;
    }

    return file;
}

// Returns up to `words` words of text before and after a match.
//
// match: Result of a regex search; the context is taken from its prefix()
//        and suffix().
// words: Number of words on each side. 0 yields an empty context.
//
// Words are delimited by ' ', '\n', '\r' and '\t'. Whitespace at the outer
// ends of the context is stripped, whitespace adjacent to the match is kept.
match_context context(const boost::match_results<string::const_iterator> &match,
                      std::uint64_t words)
{
    if (words == 0)
    {
        return {};
    }

    const auto &prefix{match.prefix()};
    const auto &suffix{match.suffix()};

    // The text before the match is walked backwards, starting at the match.
    const auto rbegin_before{std::make_reverse_iterator(prefix.end())};
    const auto rend_before{std::make_reverse_iterator(prefix.begin())};

    const auto begin_after{suffix.begin()};
    const auto end_after{suffix.end()};

    auto pos_before{rbegin_before};
    auto pos_after{begin_after};

    const std::array<char, 4> whitespace{' ', '\n', '\r', '\t'};

    // Each round moves both positions past the next whitespace run. There is
    // one round more than `words`; presumably the extra round accounts for a
    // word that is only partially covered by the match. Counting upwards
    // instead of incrementing `words` can not overflow.
    for (std::uint64_t round{0};; ++round)
    {
        if (pos_before != rend_before)
        {
            pos_before = std::find_first_of(pos_before, rend_before,
                                            whitespace.begin(),
                                            whitespace.end());
            while (pos_before != rend_before
                   && helpers::is_whitespace(*pos_before))
            {
                ++pos_before;
            }
        }

        if (pos_after != end_after)
        {
            pos_after = std::find_first_of(pos_after, end_after,
                                           whitespace.begin(),
                                           whitespace.end());
            while (pos_after != end_after && helpers::is_whitespace(*pos_after))
            {
                ++pos_after;
            }
        }

        // Further rounds would not change anything once both sides are
        // exhausted.
        if (round == words
            || (pos_before == rend_before && pos_after == end_after))
        {
            break;
        }
    }

    // base() converts the reverse position back into a forward iterator, so
    // the text can be copied in reading order.
    string before(pos_before.base(), prefix.end());
    string after(begin_after, pos_after);

    // Both strings can be empty, e. g. if the match is at the very beginning
    // or end of the text; check that before looking at their ends.
    while (!before.empty() && helpers::is_whitespace(before.front()))
    {
        before.erase(0, 1);
    }
    while (!after.empty() && helpers::is_whitespace(after.back()))
    {
        after.pop_back();
    }

    return {before, after};
}

// Returns the text of the last entry in file.headlines (in iteration order)
// that comes before the first entry whose position is greater than `pos`, or
// an empty string if there is none.
std::string headline(const file_in_epub &file, const size_t pos)
{
    std::string_view current;

    auto it{file.headlines.begin()};
    const auto stop{file.headlines.end()};
    while (it != stop && !(it->first > pos))
    {
        current = it->second;
        ++it;
    }

    return string(current);
}

// Returns the page label that applies at `pos`: The label of the last entry
// in file.pages (in iteration order) that comes before the first entry whose
// position is greater than `pos`. Empty if there is no such entry.
string page(const file_in_epub &file, const size_t pos)
{
    std::string_view label;

    for (const auto &[offset, name] : file.pages)
    {
        const bool behind_pos{offset > pos};
        if (behind_pos)
        {
            break;
        }
        label = name;
    }

    return string(label);
}

} // namespace epubgrep::search