/*  This file is part of epubgrep.
 *  Copyright © 2021 tastytea
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Affero General Public License as published by
 *  the Free Software Foundation, version 3.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 *  GNU Affero General Public License for more details.
 *
 *  You should have received a copy of the GNU Affero General Public License
 *  along with this program. If not, see <https://www.gnu.org/licenses/>.
 */

#include "book.hpp"

#include "fs-compat.hpp"
#include "helpers.hpp"
#include "log.hpp"
#include "zip.hpp"

// NOTE(review): the targets of the ten includes below are missing from the
// text as found; they are kept unchanged and need to be restored.
#include
#include
#include
#include // For compatibility with fmt 4.
#include
#include
#include
#include
#include
#include

namespace epubgrep::book
{

using boost::locale::translate;
using fmt::format;
using std::string;

// Read the book archive at `filepath` and return its documents.
//
// filepath: Path of the archive handed to the zip:: helpers.
// raw:      If false, the documents returned by list_spine() are read,
//           HTML-unescaped and run through process_page(). If true, every
//           entry returned by zip::list() is stored unprocessed: text_raw
//           holds the file contents and text holds a copy of it.
//
// The book language is taken from the <dc:language> element of the OPF file
// (looked up with and without the "opf:" prefix). It is empty if the OPF file
// can not be parsed or a zip::exception occurs while reading it; such an
// exception is logged unless its code is 1. Every document inherits the book
// language.
//
// Only the language lookup catches zip::exception; nothing is caught around
// listing or reading the documents themselves.
book read(const fs::path filepath, const bool raw)
{
    using helpers::unescape_html;

    DEBUGLOG << "Processing book " << filepath;

    // Decide which files of the archive are going to be read.
    const auto epub_filepaths{[&filepath, raw]
                              {
                                  if (!raw)
                                  {
                                      return list_spine(filepath);
                                  }
                                  return zip::list(filepath);
                              }()};

    book current_book;
    current_book.language = [&filepath]() -> string
    {
        try
        {
            const auto opf_file_path{get_opf_file_path(filepath)};
            const std::string opf_file{
                zip::read_file(filepath, opf_file_path.string())};

            pugi::xml_document xml;
            const auto parsed{
                xml.load_buffer(opf_file.data(), opf_file.size())};
            if (parsed)
            {
                auto lang{xml.child("package")
                              .child("metadata")
                              .child("dc:language")};
                if (lang.empty())
                {
                    // Retry with the namespace-prefixed element names.
                    lang = xml.child("opf:package")
                               .child("opf:metadata")
                               .child("dc:language");
                }
                // Yields an empty string if neither lookup found the node.
                return lang.text().as_string();
            }
        }
        catch (const epubgrep::zip::exception &e)
        {
            if (e.code != 1) // 1 == container.xml not found.
            {
                LOG(log::sev::error) << e.what();
            }
        }

        return "";
    }();
    DEBUGLOG << "Book language detected: " << current_book.language;

    for (const auto &entry : epub_filepaths)
    {
        DEBUGLOG << "Processing document " << entry;

        document doc;
        if (raw)
        {
            doc.text_raw = zip::read_file(filepath, entry);
            // NOTE(review): element type assumed to be string; confirm
            // against the declaration of document::text.
            doc.text = std::make_unique<string>(doc.text_raw);
        }
        else
        {
            doc = process_page(unescape_html(zip::read_file(filepath, entry)));
        }

        doc.language = current_book.language; // FIXME: Get language of doc.
        current_book.files.emplace_back(entry, std::move(doc));
    }

    return current_book;
}

document process_page(const std::string_view text) { string output{text}; static const boost::regex re_header_start{"<[hH][1-6]"}; static const boost::regex re_header_end{"]+pagebreak[^>]+" "(title|aria-label)" "=\"([[:alnum:]]+)\""}; { size_t pos{0}; while ((pos = output.find_first_of("\n\t\r", pos)) != string::npos) { if (output[pos] == '\r') { output.erase(pos, 1); } else { output.replace(pos, 1, " "); } } } { size_t pos{0}; while ((pos = output.find(" ", pos)) != string::npos) { output.replace(pos, 2, " "); } } size_t pos{0}; document doc; size_t headline_start{string::npos}; while ((pos = output.find('<', pos)) != string::npos) { auto endpos{output.find('>', pos) + 1}; if (boost::regex_match(output.substr(pos, 3), re_header_start)) { headline_start = pos; } else if (boost::regex_match(output.substr(pos, 4), re_header_end)) { if (headline_start != string::npos) { doc.headlines.insert( {headline_start, output.substr(headline_start, pos - headline_start)}); headline_start = string::npos; } } else if (output.substr(pos, 6) == " match; using it_size_t = string::const_iterator::difference_type; string::const_iterator begin{output.begin() + static_cast(pos)}; string::const_iterator end{output.begin() + static_cast(endpos)}; if (boost::regex_search(begin, end, match, re_pagebreak)) { doc.pages.insert({pos, match[2].str()}); } } else if (output.substr(pos, 7) == "