2021-05-21 01:56:37 +02:00
|
|
|
/* This file is part of epubgrep.
|
|
|
|
* Copyright © 2021 tastytea <tastytea@tastytea.de>
|
|
|
|
*
|
|
|
|
* This program is free software: you can redistribute it and/or modify
|
|
|
|
* it under the terms of the GNU Affero General Public License as published by
|
|
|
|
* the Free Software Foundation, version 3.
|
|
|
|
*
|
|
|
|
* This program is distributed in the hope that it will be useful,
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
* GNU Affero General Public License for more details.
|
|
|
|
*
|
|
|
|
* You should have received a copy of the GNU Affero General Public
|
|
|
|
* License along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include "zip.hpp"
|
|
|
|
|
|
|
|
#include "fs-compat.hpp"
|
|
|
|
|
|
|
|
#include <archive.h>
|
|
|
|
#include <archive_entry.h>
|
2021-05-21 03:25:42 +02:00
|
|
|
#include <boost/locale/message.hpp>
|
|
|
|
#include <fmt/format.h>
|
|
|
|
#include <fmt/ostream.h> // For compatibility with fmt 4.
|
2021-05-29 15:50:03 +02:00
|
|
|
#include <pugixml.hpp>
|
2021-05-21 01:56:37 +02:00
|
|
|
|
2021-05-23 08:56:58 +02:00
|
|
|
#include <cstdlib>
|
|
|
|
#include <cstring>
|
2021-05-27 21:39:01 +02:00
|
|
|
#include <fstream>
|
2021-05-28 13:55:11 +02:00
|
|
|
#include <iostream>
|
2021-05-27 21:39:01 +02:00
|
|
|
#include <stdexcept>
|
2021-05-21 01:56:37 +02:00
|
|
|
#include <string>
|
2021-05-29 15:50:03 +02:00
|
|
|
#include <string_view>
|
2021-05-21 01:56:37 +02:00
|
|
|
#include <vector>
|
|
|
|
|
|
|
|
namespace epubgrep::zip
|
|
|
|
{
|
|
|
|
|
2021-05-21 03:25:42 +02:00
|
|
|
using boost::locale::translate;
|
|
|
|
using fmt::format;
|
2021-05-29 18:12:56 +02:00
|
|
|
using std::cerr;
|
2021-05-21 03:25:42 +02:00
|
|
|
|
2021-05-21 01:56:37 +02:00
|
|
|
// Collect the UTF-8 path names of all entries in the archive at filepath.
//
// Entries for which libarchive cannot provide a UTF-8 path name are reported
// on stderr and left out of the result. The archive is opened with open_file()
// and closed with close_file(), so the exceptions of both can propagate.
std::vector<std::string> list(const fs::path &filepath)
{
    auto *handle{open_file(filepath)};
    std::vector<std::string> entry_names;

    for (struct archive_entry *current{};
         archive_read_next_header(handle, &current) == ARCHIVE_OK;)
    {
        const auto *name{archive_entry_pathname_utf8(current)};
        if (name != nullptr)
        {
            entry_names.emplace_back(name);
            // Only the header is of interest here, the data is not read.
            archive_read_data_skip(handle);
            continue;
        }

        // The encoding is broken, warn and move on to the next entry.
        const auto message{format(translate("File in {0:s} is damaged. "
                                            "Skipping in-EPUB file.\n")
                                      .str()
                                      .data(),
                                  filepath)};
        std::cerr << translate("WARNING: ") << message;
    }

    close_file(handle, filepath);

    return entry_names;
}
|
|
|
|
|
|
|
|
// Read the entry entry_path from the archive at filepath and return its
// contents.
//
// Entries without a valid UTF-8 path name are reported on stderr and skipped.
//
// Throws an exception if fewer or more bytes than the size recorded in the
// entry header could be read. If the entry does not exist, a warning is
// written to stderr and an empty string is returned, except for
// "META-INF/container.xml": its absence throws an exception with code 1.
std::string read_file(const fs::path &filepath, std::string_view entry_path)
{
    auto *zipfile{open_file(filepath)};

    struct archive_entry *entry{};
    while (archive_read_next_header(zipfile, &entry) == ARCHIVE_OK)
    {
        const auto *path{archive_entry_pathname_utf8(entry)};
        if (path == nullptr)
        { // If the encoding is broken, we skip the file.
            std::cerr << translate("WARNING: ")
                      << format(translate("File in {0:s} is damaged. "
                                          "Skipping in-EPUB file.\n")
                                    .str()
                                    .data(),
                                filepath);
            continue;
        }

        // Compare as string_view: entry_path is not guaranteed to be
        // null-terminated, so handing entry_path.data() to strcmp() could read
        // past the end of the view or compare the wrong number of characters.
        if (entry_path == path)
        {
            const auto length{static_cast<size_t>(archive_entry_size(entry))};
            std::string filecontents;
            filecontents.resize(length);
            auto result_length{static_cast<size_t>(
                archive_read_data(zipfile, &filecontents[0], length))};

            // A negative error code from archive_read_data() becomes a huge
            // size_t and is therefore caught by this comparison as well.
            if (result_length != length)
            {
                close_file(zipfile, filepath);

                throw exception{
                    format(translate("Could not read {0:s} in {1:s}.").str(),
                           entry_path, filepath.string())};
            }

            close_file(zipfile, filepath);

            return filecontents;
        }
        archive_read_data_skip(zipfile);
    }

    close_file(zipfile, filepath);

    if (entry_path == "META-INF/container.xml")
    { // File is probably not an EPUB.
        exception e{format(translate("{0:s} not found in {1:s}.").str(),
                           entry_path, filepath.string())};
        e.code = 1;
        throw exception{e};
    }

    cerr << translate("WARNING: ")
         << format(translate("{0:s} not found in {1:s}.").str(), entry_path,
                   filepath.string())
         << '\n';
    return {};
}
|
|
|
|
|
|
|
|
// Open the archive at filepath for reading and return the libarchive handle.
//
// The file is first opened with a std::ifstream that throws on failbit, so an
// unreadable file raises a stream exception before libarchive is involved.
// If libarchive cannot open the file, the handle is released through
// close_file() and an exception with code 1 is thrown.
struct archive *open_file(const fs::path &filepath)
{
    { // Throw exception if we can't open the file.
        std::ifstream probe;
        probe.exceptions(std::ios::failbit);
        probe.open(filepath);
        probe.close();
    }

    // Block size passed to archive_read_open_filename().
    constexpr size_t block_size{10240};

    auto *handle{archive_read_new()};
    archive_read_support_filter_all(handle);
    archive_read_support_format_zip(handle);

    if (archive_read_open_filename(handle, filepath.c_str(), block_size)
        == ARCHIVE_OK)
    {
        return handle;
    }

    close_file(handle, filepath);

    exception error{format(translate("Could not open {0:s}.").str(),
                           filepath.string())};
    error.code = 1;
    throw exception{error};
}
|
2021-05-21 01:56:37 +02:00
|
|
|
|
2021-05-23 08:56:58 +02:00
|
|
|
// Release the libarchive read handle zipfile.
//
// filepath is only used for the error message. Throws an exception if
// archive_read_free() does not return ARCHIVE_OK.
void close_file(struct archive *zipfile, const fs::path &filepath)
{
    if (archive_read_free(zipfile) == ARCHIVE_OK)
    {
        return;
    }

    const auto message{format(translate("Could not close {0:s}.").str(),
                              filepath.string())};
    throw exception{message};
}
|
|
|
|
|
2021-05-29 15:50:03 +02:00
|
|
|
// Return the in-archive paths of the documents listed in the spine of the
// EPUB at filepath, in spine order.
//
// The OPF file is located via the "full-path" attribute of the first child of
// <rootfiles> in META-INF/container.xml. Each <itemref> of the spine is
// resolved through the manifest item with the matching id; its URL-decoded
// href is taken relative to the directory of the OPF file unless it starts
// with '/'.
//
// XML parse errors are reported on stderr. If the OPF path or the spine could
// not be determined, an error is written to stderr and an empty vector is
// returned. Exceptions thrown by read_file() propagate to the caller.
std::vector<std::string> list_spine(const fs::path &filepath)
{
    const fs::path opf_file_path{
        // The explicit return type is essential. Attribute values handed out
        // by pugixml point into memory owned by the xml_document, which is
        // destroyed when the lambda returns. Returning std::string copies the
        // characters while the document is still alive; a deduced
        // `const char *` return type would dangle.
        [&filepath]() -> std::string
        {
            pugi::xml_document xml;
            const std::string container{
                read_file(filepath, "META-INF/container.xml")};
            const auto result{xml.load_buffer(&container[0], container.size())};
            if (result)
            {
                return xml.child("container")
                    .child("rootfiles")
                    .first_child()
                    .attribute("full-path")
                    .value();
            }
            cerr << translate("ERROR: ") << result.description() << '\n';

            return "";
        }()};

    std::vector<std::string> spine_filepaths;
    if (!opf_file_path.empty())
    {
        pugi::xml_document xml;
        const std::string opf_file{read_file(filepath, opf_file_path.string())};
        const auto result{xml.load_buffer(&opf_file[0], opf_file.size())};
        if (result)
        {
            // Some OPF files use an explicit "opf:" namespace prefix.
            auto manifest{xml.child("package").child("manifest")};
            if (manifest == nullptr)
            {
                manifest = xml.child("opf:package").child("opf:manifest");
            }
            auto spine{xml.child("package").child("spine")};
            if (spine == nullptr)
            {
                spine = xml.child("opf:package").child("opf:spine");
            }

            for (const auto &itemref : spine)
            {
                const auto *idref{itemref.attribute("idref").value()};
                const auto item{manifest.find_child_by_attribute("id", idref)};
                const std::string href{
                    urldecode(item.attribute("href").value())};
                if (href.empty())
                { // No usable manifest entry for this itemref. Appending an
                  // empty href would only yield the OPF directory itself.
                    continue;
                }
                if (href[0] != '/')
                {
                    spine_filepaths.emplace_back(
                        opf_file_path.parent_path() /= href);
                    continue;
                }
                spine_filepaths.emplace_back(href);
            }
        }
        else
        {
            cerr << translate("ERROR: ") << "XML: " << result.description()
                 << '\n';
        }
    }

    if (opf_file_path.empty() || spine_filepaths.empty())
    {
        std::cerr << translate("ERROR: ")
                  << format(translate("{0:s} is damaged. Could not read spine. "
                                      "Skipping file.\n")
                                .str()
                                .data(),
                            filepath);
        return {};
    }

    return spine_filepaths;
}
|
|
|
|
|
|
|
|
// Decode percent-encoded octets in url (RFC 3986, section 2.1).
//
// Every '%' that is followed by exactly two hexadecimal digits is replaced by
// the octet they encode. A '%' that is not followed by two hexadecimal digits
// (malformed or truncated escape) is copied to the output unchanged instead of
// raising an exception. All other characters are copied verbatim.
std::string urldecode(const std::string_view url)
{
    // Value of a single hexadecimal digit, or -1 if digit is not one.
    const auto hex_value{[](const char digit) -> int
                         {
                             if (digit >= '0' && digit <= '9')
                             {
                                 return digit - '0';
                             }
                             if (digit >= 'a' && digit <= 'f')
                             {
                                 return digit - 'a' + 10;
                             }
                             if (digit >= 'A' && digit <= 'F')
                             {
                                 return digit - 'A' + 10;
                             }
                             return -1;
                         }};

    std::string decoded;
    decoded.reserve(url.size());

    size_t pos{0};
    while (pos < url.size())
    {
        const size_t percent{url.find('%', pos)};
        if (percent == std::string_view::npos)
        {
            break;
        }
        decoded += url.substr(pos, percent - pos);

        // Both digits must exist inside the view before they are inspected.
        const bool complete{url.size() - percent > 2};
        const int high{complete ? hex_value(url[percent + 1]) : -1};
        const int low{complete ? hex_value(url[percent + 2]) : -1};
        if (high >= 0 && low >= 0)
        {
            decoded += static_cast<char>(high * 16 + low);
            pos = percent + 3;
        }
        else
        { // Not a valid escape, keep the '%' literally.
            decoded += '%';
            pos = percent + 1;
        }
    }
    // pos never exceeds url.size() here, so substr() cannot throw.
    decoded += url.substr(pos);

    return decoded;
}
|
|
|
|
|
2021-05-21 01:56:37 +02:00
|
|
|
} // namespace epubgrep::zip
|