epubgrep/src/search.cpp

/*  This file is part of epubgrep.
 *  Copyright © 2021 tastytea <tastytea@tastytea.de>
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Affero General Public License as published by
 *  the Free Software Foundation, version 3.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Affero General Public License for more details.
 *
 *  You should have received a copy of the GNU Affero General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#include "search.hpp"

#include "fs-compat.hpp"
#include "zip.hpp"

#include <boost/regex.hpp>

#include <algorithm>
#include <string>
#include <string_view>
#include <vector>

namespace epubgrep::search
{

using std::string;

// Search every file inside the EPUB at `filepath` for `regex`.
//
// The regex dialect and case sensitivity are taken from `opts`. Unless
// `opts.raw` is set, each document is passed through cleanup_text() before it
// is searched.
//
// Returns one entry per non-empty match, in the order the files are listed by
// zip::list() and the matches occur in each file. Each entry carries the
// matched text, the surrounding context (opts.context words) and the most
// recent headline / page marker seen so far in the same file.
std::vector<match> search(const fs::path &filepath, std::string_view regex,
                          const settings &opts)
{
    boost::regex::flag_type flags{};

    switch (opts.regex)
    {
    case options::regex_kind::basic:
    {
        flags = opts.grep_like ? boost::regex::grep : boost::regex::basic;
        break;
    }
    case options::regex_kind::extended:
    {
        flags = opts.grep_like ? boost::regex::egrep : boost::regex::extended;
        break;
    }
    case options::regex_kind::perl:
    {
        flags = boost::regex::perl;
        break;
    }
    }

    if (opts.ignore_case)
    {
        flags |= boost::regex::icase;
    }

    // A string_view is not guaranteed to be NUL-terminated, so pass the
    // pattern as a [begin, end) range instead of as a C string.
    const boost::regex re(regex.data(), regex.data() + regex.size(), flags);

    // Empty matches carry no text and would never advance the search
    // position, which would make the loop below spin forever.
    const auto match_flags{boost::match_default | boost::match_not_null};

    std::vector<match> matches;
    for (const auto &entry : zip::list(filepath))
    {
        auto document{zip::read_file(filepath, entry)};
        if (!opts.raw)
        {
            cleanup_text(document);
        }

        string::const_iterator begin{document.begin()};
        string::const_iterator end{document.end()};
        boost::match_results<string::const_iterator> match_result;
        // Every search restarts behind the previous match, so the prefix only
        // covers the text since then. Remember the last markers we saw to
        // carry them over to matches that have none in their own prefix.
        string last_headline;
        string last_page;

        while (boost::regex_search(begin, end, match_result, re, match_flags))
        {
            // Copy the prefix once; both marker lookups work on it.
            const string prefix{match_result.prefix().str()};

            match hit;
            hit.epub_filepath = filepath;
            hit.filepath = entry;
            hit.text = match_result[0];
            hit.context = context(match_result, opts.context);

            const auto current_headline{headline(prefix)};
            if (!current_headline.empty())
            {
                last_headline = current_headline;
            }
            hit.headline = last_headline;

            const auto current_page{page(prefix)};
            if (!current_page.empty())
            {
                last_page = current_page;
            }
            hit.page = last_page;

            matches.emplace_back(hit);
            begin = match_result[0].second;
        }
    }

    return matches;
}

void cleanup_text(string &text)
{
    size_t pos{};
    while ((pos = text.find('<', pos)) != string::npos)
    {
        // Mark headlines. We need them later on.
        string replacement;
        if (boost::regex_match(text.substr(pos, 3), boost::regex{"<[hH][1-6]"}))
        {
            replacement = "<H>";
        }
        else if (boost::regex_match(text.substr(pos, 3),
                                    boost::regex{"</[hH]"}))
        {
            replacement = "</H>";
        }
        else if (text.substr(pos, 5) == "<span")
        {
            auto endpos{text.find('>')};
            boost::match_results<const char *> match;
            const boost::regex re_pagebreak{".+pagebreak.+(title|aria-label)"
                                            "=\"([[:alnum:]]+)\".*"};
            if (boost::regex_search(text.substr(pos, endpos).data(), match,
                                    re_pagebreak))
            {
                replacement = "<PAGE " + match[2] + ">";
            }
        }
        text.replace(pos, text.find('>', pos) + 1 - pos, replacement);
        pos += replacement.length();
    }

    pos = 0;
    while ((pos = text.find('\r', pos)) != string::npos)
    {
        text.erase(pos, 1);
    }

    pos = 0;
    while ((pos = text.find('\n', pos)) != string::npos)
    {
        text.replace(pos, 1, " ");
    }

    pos = 0;
    while ((pos = text.find("  ", pos)) != string::npos)
    {
        text.replace(pos, 2, " ");
    }
}

// Extract up to `words` words of context on each side of `match`.
//
// Words are delimited by single spaces. The text directly adjacent to the
// match (a partial word, or just the separating space) is always included and
// does not count towards `words`, hence the context reaches up to the
// (words + 1)-th space on either side. The delimiting space itself is not
// part of the result. If a side has fewer spaces than that, the whole prefix
// or suffix is returned for it.
//
// Returns an empty match_context if `words` is 0.
match_context context(const boost::match_results<string::const_iterator> &match,
                      std::uint64_t words)
{
    if (words == 0)
    {
        return {};
    }

    const string prefix{match.prefix().str()};
    const string suffix{match.suffix().str()};

    // Walk backwards through the prefix, space by space. `search_end` is the
    // exclusive upper bound of the part that has not been scanned yet, which
    // avoids decrementing an index below zero when a space sits at the very
    // beginning of the prefix.
    size_t context_begin{0};
    size_t search_end{prefix.length()};
    for (std::uint64_t spaces_seen{0}; search_end != 0; ++spaces_seen)
    {
        const auto space{prefix.rfind(' ', search_end - 1)};
        if (space == string::npos)
        {
            break; // Ran out of words, use the whole prefix.
        }
        if (spaces_seen == words)
        {
            context_begin = space + 1;
            break;
        }
        search_end = space;
    }

    // Same idea forwards through the suffix.
    size_t context_end{string::npos};
    size_t search_begin{0};
    for (std::uint64_t spaces_seen{0};; ++spaces_seen)
    {
        const auto space{suffix.find(' ', search_begin)};
        if (space == string::npos)
        {
            break; // Ran out of words, use the whole suffix.
        }
        if (spaces_seen == words)
        {
            context_end = space;
            break;
        }
        search_begin = space + 1;
    }

    return {prefix.substr(context_begin), suffix.substr(0, context_end)};
}

// Return the text that follows the last "<H>" marker in `prefix`, up to (but
// not including) the next '<' or the end of `prefix`, whichever comes first.
// Returns an empty string if `prefix` contains no "<H>" marker.
string headline(const std::string_view prefix)
{
    constexpr std::string_view marker{"<H>"};

    const auto marker_pos{prefix.rfind(marker)};
    if (marker_pos == std::string_view::npos)
    {
        return {};
    }

    const auto text_begin{marker_pos + marker.length()};
    // npos here makes the length below exceed the remaining text; substr()
    // clamps it to the end of prefix.
    const auto text_end{prefix.find('<', text_begin)};

    return string{prefix.substr(text_begin, text_end - text_begin)};
}

// Return the label of the last "<PAGE label>" marker in `prefix`, i.e. the
// text between "<PAGE " and the next '>' (or the end of `prefix` if there is
// no '>'). Returns an empty string if `prefix` contains no such marker.
string page(const std::string_view prefix)
{
    constexpr std::string_view marker{"<PAGE "};

    const auto marker_pos{prefix.rfind(marker)};
    if (marker_pos == std::string_view::npos)
    {
        return {};
    }

    const auto label_begin{marker_pos + marker.length()};
    // npos here makes the length below exceed the remaining text; substr()
    // clamps it to the end of prefix.
    const auto label_end{prefix.find('>', label_begin)};

    return string{prefix.substr(label_begin, label_end - label_begin)};
}

} // namespace epubgrep::search