/* This file is part of epubgrep.
 * Copyright © 2021 tastytea <tastytea@tastytea.de>
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, version 3.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
|
|
|
|
|
|
|
|
#ifndef EPUBGREP_BOOK_HPP
|
|
|
|
#define EPUBGREP_BOOK_HPP
|
|
|
|
|
|
|
|
#include "fs-compat.hpp"
|
|
|
|
|
|
|
|
#include <map>
|
2021-08-20 14:52:34 +02:00
|
|
|
#include <memory>
|
2021-08-17 13:05:14 +02:00
|
|
|
#include <string>
|
|
|
|
#include <string_view>
|
2021-08-17 14:22:28 +02:00
|
|
|
#include <utility>
|
|
|
|
#include <vector>
|
2021-08-17 13:05:14 +02:00
|
|
|
|
|
|
|
namespace epubgrep::book
|
|
|
|
{
|
|
|
|
|
|
|
|
using std::string;
|
|
|
|
|
|
|
|
//! Document inside EPUB.
//!
//! Stores one page as raw HTML and as plain text, plus position-keyed maps of
//! headlines and page markers and the language of the page.
//!
//! The type is over-aligned to 128 bytes. The alignment is requested with the
//! standard `alignas` specifier, so no compiler-specific attribute is needed.
//!
//! Because it contains a std::unique_ptr, a document can be moved but not
//! copied.
struct alignas(128) document
{
    string text_raw;     //!< HTML page
    string text_cleaned; //!< Plain text page

    //! Pointer to preferred text version.
    //!
    //! NOTE(review): a std::unique_ptr owns and deletes what it points to, so
    //! it must never be aimed at text_raw or text_cleaned themselves.
    //! Presumably it holds a separate copy of the preferred text — confirm
    //! against the code that fills it in.
    std::unique_ptr<string> text;

    std::map<size_t, string> headlines; //!< pos, title
    std::map<size_t, string> pages;     //!< pos, page
    string language;                    //!< Page language
};
|
|
|
|
|
|
|
|
//! EPUB file.
|
|
|
|
struct book
|
|
|
|
{
|
2021-08-17 14:22:28 +02:00
|
|
|
std::vector<std::pair<string, document>> files; //!< filename, file
|
|
|
|
std::vector<std::pair<string, string>> toc; //!< title, href
|
|
|
|
string language; //!< Book language
|
2021-08-17 13:05:14 +02:00
|
|
|
} __attribute__((aligned(128)));
|
|
|
|
|
|
|
|
//! Read and process book.
|
|
|
|
[[nodiscard]] book read(fs::path filepath, bool raw);
|
|
|
|
|
|
|
|
//! Clean up page and record headlines and page numbers.
|
|
|
|
[[nodiscard]] document process_page(std::string_view text);
|
|
|
|
|
|
|
|
//! Return last headline if possible.
|
2021-08-20 17:07:25 +02:00
|
|
|
[[nodiscard]] string headline(const document &doc, size_t pos);
|
2021-08-17 13:05:14 +02:00
|
|
|
|
|
|
|
//! Return current page if possible.
|
2021-08-20 17:07:25 +02:00
|
|
|
[[nodiscard]] string page(const document &doc, size_t pos);
|
2021-08-17 13:05:14 +02:00
|
|
|
|
2021-08-20 15:35:10 +02:00
|
|
|
//! Returns the file path of the OPF file in the EPUB.
|
|
|
|
[[nodiscard]] fs::path get_opf_file_path(const fs::path &zipfile);
|
|
|
|
|
2021-08-20 15:29:55 +02:00
|
|
|
//! Returns the files in the EPUB “spine” (all pages that are actually text).
|
|
|
|
[[nodiscard]] std::vector<string> list_spine(const fs::path &filepath);
|
|
|
|
|
2021-08-17 13:05:14 +02:00
|
|
|
} // namespace epubgrep::book
|
|
|
|
|
|
|
|
#endif // EPUBGREP_BOOK_HPP
|