epubgrep/src/zip.cpp

244 lines
7.5 KiB
C++

/* This file is part of epubgrep.
* Copyright © 2021 tastytea <tastytea@tastytea.de>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, version 3.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public
* License along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include "zip.hpp"
#include "fs-compat.hpp"
#include "helpers.hpp"
#include "log.hpp"
#include <archive.h>
#include <archive_entry.h>
#include <boost/locale/message.hpp>
#include <fmt/format.h>
#include <fmt/ostream.h> // For compatibility with fmt 4.
#include <pugixml.hpp>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <stdexcept>
#include <string>
#include <string_view>
#include <vector>
namespace epubgrep::zip
{
using boost::locale::translate;
using fmt::format;
// Return the path names of all entries stored in the archive at filepath.
//
// Entries for which libarchive cannot supply a UTF-8 path name are reported
// with a warning and left out of the result. Opening and closing the archive
// is delegated to open_file() / close_file(), so their exceptions propagate.
std::vector<std::string> list(const fs::path &filepath)
{
    struct archive *const archive_handle{open_file(filepath)};
    std::vector<std::string> names;

    for (struct archive_entry *header{};
         archive_read_next_header(archive_handle, &header) == ARCHIVE_OK;)
    {
        const char *const name{archive_entry_pathname_utf8(header)};
        if (name != nullptr)
        {
            names.emplace_back(name);
            DEBUGLOG << "Found in file: " << name;
            // Only the name is of interest here; skip over the entry's data.
            archive_read_data_skip(archive_handle);
            continue;
        }

        // No usable name (e.g. broken encoding): warn and move on.
        LOG(log::sev::warning)
            << format(translate("File in {0:s} is damaged. "
                                "Skipping in-EPUB file.\n")
                          .str()
                          .data(),
                      filepath);
    }

    close_file(archive_handle, filepath);
    return names;
}
// Read the entry named entry_path from the archive at filepath and return its
// contents.
//
// Throws `exception` if the entry is found but cannot be read completely.
// If the entry is missing: for "META-INF/container.xml" an `exception` with
// code 1 is thrown (the file is probably not an EPUB); for any other entry a
// warning is logged and an empty string is returned.
// Exceptions from open_file() / close_file() propagate unchanged.
std::string read_file(const fs::path &filepath, std::string_view entry_path)
{
    auto *zipfile{open_file(filepath)};

    struct archive_entry *entry{};
    while (archive_read_next_header(zipfile, &entry) == ARCHIVE_OK)
    {
        const auto *path{archive_entry_pathname_utf8(entry)};
        if (path == nullptr)
        { // If the encoding is broken, we skip the file.
            LOG(log::sev::warning)
                << format(translate("File in {0:s} is damaged. "
                                    "Skipping in-EPUB file.\n")
                              .str()
                              .data(),
                          filepath);
            continue;
        }

        // entry_path is a string_view and is not guaranteed to be
        // NUL-terminated, so compare with its length taken into account
        // instead of passing entry_path.data() to a C string function.
        if (entry_path == std::string_view{path})
        {
            const auto length{static_cast<size_t>(archive_entry_size(entry))};
            std::string filecontents;
            filecontents.resize(length);

            // A negative (error) return value becomes a huge size_t and
            // therefore fails the length check below as well.
            const auto result_length{static_cast<size_t>(
                archive_read_data(zipfile, filecontents.data(), length))};

            if (result_length != length)
            {
                close_file(zipfile, filepath);
                throw exception{
                    format(translate("Could not read {0:s} in {1:s}.").str(),
                           entry_path, filepath.string())};
            }

            close_file(zipfile, filepath);
            return filecontents;
        }

        archive_read_data_skip(zipfile);
    }

    close_file(zipfile, filepath);

    if (entry_path == "META-INF/container.xml")
    { // File is probably not an EPUB.
        exception e{format(translate("{0:s} not found in {1:s}.").str(),
                           entry_path, filepath.string())};
        e.code = 1;
        throw exception{e};
    }

    LOG(log::sev::warning)
        << format(translate("{0:s} not found in {1:s}.").str(), entry_path,
                  filepath.string())
        << '\n';

    return {};
}
// Open the archive at filepath for reading and return the libarchive handle.
//
// The caller is expected to release the handle with close_file().
// Throws whatever the std::ifstream probe throws if the file cannot be opened
// at all, and `exception` with code 1 if libarchive cannot open it.
struct archive *open_file(const fs::path &filepath)
{
    // Probe with an ifstream first so an unopenable file raises an exception.
    std::ifstream probe;
    probe.exceptions(std::ios::failbit);
    probe.open(filepath);
    probe.close();

    struct archive *handle{archive_read_new()};
    archive_read_support_filter_all(handle);
    archive_read_support_format_zip(handle);

    const auto status{
        archive_read_open_filename(handle, filepath.c_str(), 10240)};
    if (status == ARCHIVE_OK)
    {
        return handle;
    }

    // Opening failed: free the handle before reporting the error.
    close_file(handle, filepath);
    exception error{format(translate("Could not open {0:s}.").str(),
                           filepath.string())};
    error.code = 1;
    throw exception{error};
}
// Free the libarchive read handle zipfile.
//
// filepath is only used for the error message. Throws `exception` if
// archive_read_free() does not report ARCHIVE_OK.
void close_file(struct archive *zipfile, const fs::path &filepath)
{
    if (archive_read_free(zipfile) == ARCHIVE_OK)
    {
        return;
    }

    throw exception{format(translate("Could not close {0:s}.").str(),
                           filepath.string())};
}
// Return the in-archive paths of the documents listed in the EPUB's spine, in
// spine order.
//
// The OPF file is located through the "full-path" attribute of the first child
// of <rootfiles> in META-INF/container.xml. Each spine <itemref> is resolved
// through the manifest item whose "id" equals its "idref". The item's href is
// URL-decoded and, unless it starts with '/', prefixed with the OPF file's
// directory.
//
// Spine items without a matching manifest item or without an href are skipped
// with a warning. If the OPF path or the spine cannot be determined, an error
// is logged and an empty vector is returned. Exceptions from read_file()
// propagate (e.g. when META-INF/container.xml is missing).
std::vector<std::string> list_spine(const fs::path &filepath)
{
    const auto opf_file_path{
        [&filepath]
        {
            pugi::xml_document xml;
            const std::string container{
                read_file(filepath, "META-INF/container.xml")};
            const auto result{xml.load_buffer(&container[0], container.size())};
            if (result)
            {
                return fs::path{xml.child("container")
                                    .child("rootfiles")
                                    .first_child()
                                    .attribute("full-path")
                                    .value()};
            }
            LOG(log::sev::error) << result.description() << '\n';

            return fs::path{};
        }()};

    std::vector<std::string> spine_filepaths;

    if (!opf_file_path.empty())
    {
        DEBUGLOG << "Parsing " << opf_file_path;

        pugi::xml_document xml;
        const std::string opf_file{read_file(filepath, opf_file_path.string())};

        const auto result{xml.load_buffer(&opf_file[0], opf_file.size())};
        if (result)
        {
            // Fall back to "opf:"-prefixed element names if the unprefixed
            // ones are not present.
            auto manifest{xml.child("package").child("manifest")};
            if (manifest == nullptr)
            {
                manifest = xml.child("opf:package").child("opf:manifest");
            }
            auto spine{xml.child("package").child("spine")};
            if (spine == nullptr)
            {
                spine = xml.child("opf:package").child("opf:spine");
            }

            for (const auto &itemref : spine)
            {
                const auto *idref{itemref.attribute("idref").value()};
                const auto &item{manifest.find_child_by_attribute("id", idref)};

                // pugixml yields "" for a missing node or attribute. Without
                // this check an unresolved itemref would end up in the result
                // as a bare directory path.
                const auto *raw_href{item.attribute("href").value()};
                if (raw_href[0] == '\0')
                {
                    LOG(log::sev::warning)
                        << format(translate("Spine item {0:s} in {1:s} has no "
                                            "manifest entry. Skipping it.\n")
                                      .str(),
                                  idref, filepath.string());
                    continue;
                }

                auto href{helpers::urldecode(raw_href)};
                if (href[0] != '/')
                {
                    // hrefs are relative to the directory of the OPF file.
                    href = (opf_file_path.parent_path() /= href);
                }
                DEBUGLOG << "Found in spine: " << href;
                spine_filepaths.emplace_back(href);
            }
        }
        else
        {
            LOG(log::sev::error) << "XML: " << result.description() << '\n';
        }
    }

    if (opf_file_path.empty() || spine_filepaths.empty())
    {
        LOG(log::sev::error)
            << format(translate("{0:s} is damaged. Could not read spine. "
                                "Skipping file.\n")
                          .str()
                          .data(),
                      filepath);
        return {};
    }

    return spine_filepaths;
}
} // namespace epubgrep::zip