/* This file is part of epubgrep.
 * Copyright © 2021 tastytea <tastytea@tastytea.de>
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, version 3.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

#include "book.hpp"

#include "fs-compat.hpp"
#include "helpers.hpp"
#include "log.hpp"
#include "zip.hpp"

#include <boost/locale/message.hpp>
#include <boost/regex.hpp>
#include <fmt/format.h>
#include <fmt/ostream.h> // For compatibility with fmt 4.
#include <pugixml.hpp>

#include <algorithm>
#include <memory>
#include <string>
#include <string_view>
#include <vector>

namespace epubgrep::book
{

using boost::locale::translate;
using fmt::format;
using std::string;

/*!
 *  Read an EPUB file and return it as a book.
 *
 *  @param filepath Path of the EPUB (ZIP) file.
 *  @param raw      If false, only the documents listed in the OPF spine are
 *                  read and each is cleaned via process_page(); if true,
 *                  every file in the ZIP archive is read verbatim.
 *
 *  @return book with language set (if detectable) and one (path, document)
 *          entry per processed file.
 */
book read(const fs::path filepath, const bool raw)
{
    using helpers::unescape_html;

    DEBUGLOG << "Processing book " << filepath;

    // Which files to process: the spine (reading order) for normal mode,
    // or simply everything in the archive for raw mode.
    std::vector<string> epub_filepaths{[&filepath, raw]
                                       {
                                           if (!raw)
                                           {
                                               return list_spine(filepath);
                                           }
                                           return zip::list(filepath);
                                       }()};

    book current_book;
    // Detect the book language from <dc:language> in the OPF package
    // document; returns "" if it can't be found or parsed.
    current_book.language = [&filepath]() -> string
    {
        try
        {
            pugi::xml_document xml;
            auto opf_file_path{get_opf_file_path(filepath)};
            const std::string opf_file{
                zip::read_file(filepath, opf_file_path.string())};

            const auto result{xml.load_buffer(&opf_file[0], opf_file.size())};
            if (result)
            {
                // Some EPUBs prefix the OPF elements with the "opf:"
                // namespace; try the plain names first, then fall back.
                auto lang{xml.child("package")
                              .child("metadata")
                              .child("dc:language")};
                if (lang == nullptr)
                {
                    lang = xml.child("opf:package")
                               .child("opf:metadata")
                               .child("dc:language");
                }
                return lang.text().as_string();
            }
        }
        catch (epubgrep::zip::exception &e)
        {
            if (e.code != 1) // 1 == container.xml not found.
            {
                LOG(log::sev::error) << e.what();
            }
        }
        return "";
    }();
    DEBUGLOG << "Book language detected: " << current_book.language;

    for (const auto &entry : epub_filepaths)
    {
        DEBUGLOG << "Processing document " << entry;
        document doc;
        if (!raw)
        {
            // Unescape HTML entities, then strip markup / collect metadata.
            doc = process_page(unescape_html(zip::read_file(filepath, entry)));
        }
        else
        {
            doc.text_raw = zip::read_file(filepath, entry);
            doc.text = std::make_unique<std::string>(doc.text_raw);
        }
        doc.language = current_book.language; // FIXME: Get language of doc.
        current_book.files.emplace_back(entry, std::move(doc));
    }

    return current_book;
}
/*!
 *  Strip the markup from one (X)HTML page and collect metadata on the way.
 *
 *  Whitespace is normalized first (newlines and tabs become spaces,
 *  carriage returns are removed, runs of spaces are collapsed). Then every
 *  tag is erased; while erasing, the following is recorded in the returned
 *  document, with positions relative to the cleaned text:
 *   - headlines: start position → text between <hN …> and </hN>,
 *   - pages: position → label from EPUB page break markers
 *     (<span … pagebreak … title="…"/aria-label="…">).
 *
 *  @param text Raw (X)HTML source of the page.
 *
 *  @return document with text_cleaned and text set, headlines/pages filled.
 */
document process_page(const std::string_view text)
{
    string output{text};
    // Opening / closing headline tags: <h1 … <h6, case-insensitive.
    static const boost::regex re_header_start{"<[hH][1-6]"};
    static const boost::regex re_header_end{"</[hH][1-6]"};
    // Page break markers; the page label is in capture group 2.
    static const boost::regex re_pagebreak{"[^>]+pagebreak[^>]+"
                                           "(title|aria-label)"
                                           "=\"([[:alnum:]]+)\""};

    { // Replace newlines and tabs with spaces, drop carriage returns.
        size_t pos{0};
        while ((pos = output.find_first_of("\n\t\r", pos)) != string::npos)
        {
            if (output[pos] == '\r')
            {
                output.erase(pos, 1);
            }
            else
            {
                output.replace(pos, 1, " ");
            }
        }
    }
    { // Collapse each run of spaces into a single space. We must search
      // for TWO spaces here: searching for a single space while replacing
      // 2 characters would eat the character after every lone space. Not
      // advancing pos after a replacement handles runs of any length.
        size_t pos{0};
        while ((pos = output.find("  ", pos)) != string::npos)
        {
            output.replace(pos, 2, " ");
        }
    }

    size_t pos{0};
    document doc;
    size_t headline_start{string::npos}; // npos == not inside a headline.
    while ((pos = output.find('<', pos)) != string::npos)
    {
        const auto tag_close{output.find('>', pos)};
        if (tag_close == string::npos)
        { // Unterminated tag: nothing sensible follows, drop the rest.
            output.erase(pos);
            break;
        }
        auto endpos{tag_close + 1}; // One past the '>' of this tag.

        if (boost::regex_match(output.substr(pos, 3), re_header_start))
        {
            headline_start = pos;
        }
        else if (boost::regex_match(output.substr(pos, 4), re_header_end))
        {
            if (headline_start != string::npos)
            {
                // Tags inside the headline were already erased by earlier
                // iterations, so the recorded text is markup-free.
                doc.headlines.insert(
                    {headline_start,
                     output.substr(headline_start, pos - headline_start)});
                headline_start = string::npos;
            }
        }
        else if (output.substr(pos, 6) == "<span ")
        {
            boost::match_results<string::const_iterator> match;
            using it_size_t = string::const_iterator::difference_type;
            string::const_iterator begin{output.begin()
                                         + static_cast<it_size_t>(pos)};
            string::const_iterator end{output.begin()
                                       + static_cast<it_size_t>(endpos)};

            if (boost::regex_search(begin, end, match, re_pagebreak))
            {
                doc.pages.insert({pos, match[2].str()});
            }
        }
        else if (output.substr(pos, 7) == "<style "
                 || output.substr(pos, 8) == "<script ")
        {
            // Not self-closing: extend the erased span to the next '>',
            // which drops the element content as well, provided the
            // content itself contains no '>'.
            if (output.find("/>", pos) > endpos)
            {
                endpos = output.find('>', endpos) + 1;
            }
        }

        // Erase the tag (plus any extra span selected above); pos now
        // points at the text that followed it. Later erasures only touch
        // positions >= pos, so positions recorded above stay valid in the
        // cleaned text.
        output.erase(pos, endpos - pos);
    }

    doc.text_cleaned = output;
    doc.text = std::make_unique<string>(doc.text_cleaned);

    return doc;
}
std::string headline(const document &doc, const size_t pos)
|
|
{
|
|
std::string_view last;
|
|
|
|
for (const auto &pair : doc.headlines)
|
|
{
|
|
if (pair.first > pos)
|
|
{
|
|
break;
|
|
}
|
|
last = pair.second;
|
|
}
|
|
|
|
return string(last);
|
|
}
/*!
 *  Return the page label in effect at position @a pos, i.e. the label of
 *  the last page break recorded at or before @a pos. Returns an empty
 *  string when no page break precedes the position.
 */
string page(const document &doc, const size_t pos)
{
    std::string_view label;

    // Page breaks are keyed by position; keep the latest label whose
    // position does not exceed pos.
    for (auto it = doc.pages.begin(); it != doc.pages.end(); ++it)
    {
        if (it->first > pos)
        {
            break;
        }
        label = it->second;
    }

    return string(label);
}
/*!
 *  Return the archive-internal path of the OPF package document, read
 *  from META-INF/container.xml. Returns an empty path (and logs an
 *  error) if container.xml cannot be parsed.
 */
fs::path get_opf_file_path(const fs::path &zipfile)
{
    const std::string container{
        zip::read_file(zipfile, "META-INF/container.xml")};

    pugi::xml_document xml;
    const auto parsed{xml.load_buffer(container.data(), container.size())};
    if (!parsed)
    {
        LOG(log::sev::error) << parsed.description() << '\n';
        return fs::path{};
    }

    // The first <rootfile> in <rootfiles> carries the OPF path in its
    // "full-path" attribute.
    const auto rootfile{
        xml.child("container").child("rootfiles").first_child()};
    return fs::path{rootfile.attribute("full-path").value()};
}
/*!
 *  Return the archive-internal file paths of the spine documents (the
 *  book's reading order), resolved against the OPF file's directory.
 *
 *  @param filepath Path of the EPUB (ZIP) file.
 *
 *  @return Paths in spine order; empty vector (with an error logged) if
 *          the OPF file or the spine could not be read.
 */
std::vector<string> list_spine(const fs::path &filepath)
{
    auto opf_file_path{get_opf_file_path(filepath)};
    std::vector<std::string> spine_filepaths;
    if (!opf_file_path.empty())
    {
        DEBUGLOG << "Parsing " << opf_file_path;
        pugi::xml_document xml;
        const std::string opf_file{
            zip::read_file(filepath, opf_file_path.string())};
        const auto result{xml.load_buffer(&opf_file[0], opf_file.size())};
        if (result)
        {
            // Some EPUBs prefix the OPF elements with the "opf:"
            // namespace; try the plain names first, then fall back.
            auto manifest{xml.child("package").child("manifest")};
            if (manifest == nullptr)
            {
                manifest = xml.child("opf:package").child("opf:manifest");
            }
            auto spine{xml.child("package").child("spine")};
            if (spine == nullptr)
            {
                spine = xml.child("opf:package").child("opf:spine");
            }

            // Each <itemref idref="…"> in the spine points at a manifest
            // <item id="…" href="…">; the href is the file path.
            for (const auto &itemref : spine)
            {
                const auto &idref{itemref.attribute("idref").value()};
                const auto &item{manifest.find_child_by_attribute("id", idref)};
                auto href{helpers::urldecode(item.attribute("href").value())};
                if (href[0] != '/')
                {
                    // Relative href: resolve against the OPF's directory.
                    // (/= on the temporary is equivalent to operator/.)
                    href = (opf_file_path.parent_path() /= href);
                }
                DEBUGLOG << "Found in spine: " << href;
                spine_filepaths.emplace_back(href);
            }
        }
        else
        {
            LOG(log::sev::error) << "XML: " << result.description() << '\n';
        }
    }

    if (opf_file_path.empty() || spine_filepaths.empty())
    {
        LOG(log::sev::error)
            << format(translate("{0:s} is damaged. Could not read spine. "
                                "Skipping file.\n")
                          .str()
                          .data(),
                      filepath);
        return {};
    }

    return spine_filepaths;
}

} // namespace epubgrep::book