/* This file is part of epubgrep.
 * Copyright © 2021 tastytea <tastytea@tastytea.de>
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, version 3.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

#include "book.hpp"

#include "fs-compat.hpp"
#include "helpers.hpp"
#include "log.hpp"
#include "zip.hpp"

#include <boost/locale/message.hpp>
#include <boost/regex.hpp>
#include <fmt/format.h>
#include <fmt/ostream.h> // For compatibility with fmt 4.
#include <pugixml.hpp>

#include <algorithm>
#include <memory>
#include <string>
#include <string_view>
#include <vector>

namespace epubgrep::book
{

using boost::locale::translate;
using fmt::format;
using std::string;

/*!
 *  Read an EPUB file and return it as a book.
 *
 *  @param filepath Path of the EPUB (ZIP) file.
 *  @param raw      If false, only the documents listed in the OPF spine are
 *                  read and each is cleaned via process_page(); if true,
 *                  every file in the ZIP archive is read verbatim.
 *
 *  @return book with language set (if detectable) and one (path, document)
 *          entry per processed file.
 */
book read(const fs::path filepath, const bool raw)
{
    using helpers::unescape_html;

    DEBUGLOG << "Processing book " << filepath;

    // Which files to process: the spine (reading order) for normal mode,
    // or simply everything in the archive for raw mode.
    std::vector<string> epub_filepaths{[&filepath, raw]
                                       {
                                           if (!raw)
                                           {
                                               return list_spine(filepath);
                                           }
                                           return zip::list(filepath);
                                       }()};

    book current_book;
    // Detect the book language from <dc:language> in the OPF package
    // document; returns "" if it can't be found or parsed.
    current_book.language = [&filepath]() -> string
    {
        try
        {
            pugi::xml_document xml;
            auto opf_file_path{get_opf_file_path(filepath)};
            const std::string opf_file{
                zip::read_file(filepath, opf_file_path.string())};

            const auto result{xml.load_buffer(&opf_file[0], opf_file.size())};
            if (result)
            {
                // Some EPUBs prefix the OPF elements with the "opf:"
                // namespace; try the plain names first, then fall back.
                auto lang{xml.child("package")
                              .child("metadata")
                              .child("dc:language")};
                if (lang == nullptr)
                {
                    lang = xml.child("opf:package")
                               .child("opf:metadata")
                               .child("dc:language");
                }
                return lang.text().as_string();
            }
        }
        catch (epubgrep::zip::exception &e)
        {
            if (e.code != 1) // 1 == container.xml not found.
            {
                LOG(log::sev::error) << e.what();
            }
        }
        return "";
    }();
    DEBUGLOG << "Book language detected: " << current_book.language;

    for (const auto &entry : epub_filepaths)
    {
        DEBUGLOG << "Processing document " << entry;
        document doc;
        if (!raw)
        {
            // Unescape HTML entities, then strip markup / collect metadata.
            doc = process_page(unescape_html(zip::read_file(filepath, entry)));
        }
        else
        {
            doc.text_raw = zip::read_file(filepath, entry);
            doc.text = std::make_unique<std::string>(doc.text_raw);
        }
        doc.language = current_book.language; // FIXME: Get language of doc.
        current_book.files.emplace_back(entry, std::move(doc));
    }

    return current_book;
}
/*!
 *  Strip the markup from one (X)HTML page and collect metadata on the way.
 *
 *  Whitespace is normalized first (newlines and tabs become spaces,
 *  carriage returns are removed, runs of spaces are collapsed). Then every
 *  tag is erased; while erasing, the following is recorded in the returned
 *  document, with positions relative to the cleaned text:
 *   - headlines: start position → text between <hN …> and </hN>,
 *   - pages: position → label from EPUB page break markers
 *     (<span … pagebreak … title="…"/aria-label="…">).
 *
 *  @param text Raw (X)HTML source of the page.
 *
 *  @return document with text_cleaned and text set, headlines/pages filled.
 */
document process_page(const std::string_view text)
{
    string output{text};
    // Opening / closing headline tags: <h1 … <h6, case-insensitive.
    static const boost::regex re_header_start{"<[hH][1-6]"};
    static const boost::regex re_header_end{"</[hH][1-6]"};
    // Page break markers; the page label is in capture group 2.
    static const boost::regex re_pagebreak{"[^>]+pagebreak[^>]+"
                                           "(title|aria-label)"
                                           "=\"([[:alnum:]]+)\""};

    { // Replace newlines and tabs with spaces, drop carriage returns.
        size_t pos{0};
        while ((pos = output.find_first_of("\n\t\r", pos)) != string::npos)
        {
            if (output[pos] == '\r')
            {
                output.erase(pos, 1);
            }
            else
            {
                output.replace(pos, 1, " ");
            }
        }
    }
    { // Collapse each run of spaces into a single space. We must search
      // for TWO spaces here: searching for a single space while replacing
      // 2 characters would eat the character after every lone space. Not
      // advancing pos after a replacement handles runs of any length.
        size_t pos{0};
        while ((pos = output.find("  ", pos)) != string::npos)
        {
            output.replace(pos, 2, " ");
        }
    }

    size_t pos{0};
    document doc;
    size_t headline_start{string::npos}; // npos == not inside a headline.
    while ((pos = output.find('<', pos)) != string::npos)
    {
        const auto tag_close{output.find('>', pos)};
        if (tag_close == string::npos)
        { // Unterminated tag: nothing sensible follows, drop the rest.
            output.erase(pos);
            break;
        }
        auto endpos{tag_close + 1}; // One past the '>' of this tag.

        if (boost::regex_match(output.substr(pos, 3), re_header_start))
        {
            headline_start = pos;
        }
        else if (boost::regex_match(output.substr(pos, 4), re_header_end))
        {
            if (headline_start != string::npos)
            {
                // Tags inside the headline were already erased by earlier
                // iterations, so the recorded text is markup-free.
                doc.headlines.insert(
                    {headline_start,
                     output.substr(headline_start, pos - headline_start)});
                headline_start = string::npos;
            }
        }
        else if (output.substr(pos, 6) == "<span ")
        {
            boost::match_results<string::const_iterator> match;
            using it_size_t = string::const_iterator::difference_type;
            string::const_iterator begin{output.begin()
                                         + static_cast<it_size_t>(pos)};
            string::const_iterator end{output.begin()
                                       + static_cast<it_size_t>(endpos)};

            if (boost::regex_search(begin, end, match, re_pagebreak))
            {
                doc.pages.insert({pos, match[2].str()});
            }
        }
        else if (output.substr(pos, 7) == "<style "
                 || output.substr(pos, 8) == "<script ")
        {
            // Not self-closing: extend the erased span to the next '>',
            // which drops the element content as well, provided the
            // content itself contains no '>'.
            if (output.find("/>", pos) > endpos)
            {
                endpos = output.find('>', endpos) + 1;
            }
        }

        // Erase the tag (plus any extra span selected above); pos now
        // points at the text that followed it. Later erasures only touch
        // positions >= pos, so positions recorded above stay valid in the
        // cleaned text.
        output.erase(pos, endpos - pos);
    }

    doc.text_cleaned = output;
    doc.text = std::make_unique<string>(doc.text_cleaned);

    return doc;
}
std::string headline(const document &doc, const size_t pos)
|
|
{
|
|
std::string_view last;
|
|
|
|
for (const auto &pair : doc.headlines)
|
|
{
|
|
if (pair.first > pos)
|
|
{
|
|
break;
|
|
}
|
|
last = pair.second;
|
|
}
|
|
|
|
return string(last);
|
|
}
/*!
 *  Return the page label in effect at position @a pos, i.e. the label of
 *  the last page break recorded at or before @a pos. Returns an empty
 *  string when no page break precedes the position.
 */
string page(const document &doc, const size_t pos)
{
    std::string_view label;

    // Page breaks are keyed by position; keep the latest label whose
    // position does not exceed pos.
    for (auto it = doc.pages.begin(); it != doc.pages.end(); ++it)
    {
        if (it->first > pos)
        {
            break;
        }
        label = it->second;
    }

    return string(label);
}
/*!
 *  Return the archive-internal path of the OPF package document, read
 *  from META-INF/container.xml. Returns an empty path (and logs an
 *  error) if container.xml cannot be parsed.
 */
fs::path get_opf_file_path(const fs::path &zipfile)
{
    const std::string container{
        zip::read_file(zipfile, "META-INF/container.xml")};

    pugi::xml_document xml;
    const auto parsed{xml.load_buffer(container.data(), container.size())};
    if (!parsed)
    {
        LOG(log::sev::error) << parsed.description() << '\n';
        return fs::path{};
    }

    // The first <rootfile> in <rootfiles> carries the OPF path in its
    // "full-path" attribute.
    const auto rootfile{
        xml.child("container").child("rootfiles").first_child()};
    return fs::path{rootfile.attribute("full-path").value()};
}
/*!
 *  Return the archive-internal file paths of the spine documents (the
 *  book's reading order), resolved against the OPF file's directory.
 *
 *  @param filepath Path of the EPUB (ZIP) file.
 *
 *  @return Paths in spine order; empty vector (with an error logged) if
 *          the OPF file or the spine could not be read.
 */
std::vector<string> list_spine(const fs::path &filepath)
{
    auto opf_file_path{get_opf_file_path(filepath)};
    std::vector<std::string> spine_filepaths;
    if (!opf_file_path.empty())
    {
        DEBUGLOG << "Parsing " << opf_file_path;
        pugi::xml_document xml;
        const std::string opf_file{
            zip::read_file(filepath, opf_file_path.string())};
        const auto result{xml.load_buffer(&opf_file[0], opf_file.size())};
        if (result)
        {
            // Some EPUBs prefix the OPF elements with the "opf:"
            // namespace; try the plain names first, then fall back.
            auto manifest{xml.child("package").child("manifest")};
            if (manifest == nullptr)
            {
                manifest = xml.child("opf:package").child("opf:manifest");
            }
            auto spine{xml.child("package").child("spine")};
            if (spine == nullptr)
            {
                spine = xml.child("opf:package").child("opf:spine");
            }

            // Each <itemref idref="…"> in the spine points at a manifest
            // <item id="…" href="…">; the href is the file path.
            for (const auto &itemref : spine)
            {
                const auto &idref{itemref.attribute("idref").value()};
                const auto &item{manifest.find_child_by_attribute("id", idref)};
                auto href{helpers::urldecode(item.attribute("href").value())};
                if (href[0] != '/')
                {
                    // Relative href: resolve against the OPF's directory.
                    // (/= on the temporary is equivalent to operator/.)
                    href = (opf_file_path.parent_path() /= href);
                }
                DEBUGLOG << "Found in spine: " << href;
                spine_filepaths.emplace_back(href);
            }
        }
        else
        {
            LOG(log::sev::error) << "XML: " << result.description() << '\n';
        }
    }

    if (opf_file_path.empty() || spine_filepaths.empty())
    {
        LOG(log::sev::error)
            << format(translate("{0:s} is damaged. Could not read spine. "
                                "Skipping file.\n")
                          .str()
                          .data(),
                      filepath);
        return {};
    }

    return spine_filepaths;
}

} // namespace epubgrep::book