diff --git a/src/book.cpp b/src/book.cpp new file mode 100644 index 0000000..a7175d6 --- /dev/null +++ b/src/book.cpp @@ -0,0 +1,182 @@ +/* This file is part of epubgrep. + * Copyright © 2021 tastytea + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, version 3. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +#include "book.hpp" + +#include "fs-compat.hpp" +#include "helpers.hpp" +#include "log.hpp" +#include "zip.hpp" + +/* NOTE(review): the four include directives below carry no header name in this view; they look truncated and need their targets restored. */ #include + +#include +#include +#include + +namespace epubgrep::book +{ + +using std::string; + +/* Builds a book from the file at filepath. The entries to load come from zip::list_spine() when raw is false and from zip::list() when raw is true. For every entry the contents are fetched with zip::read_file(); in non-raw mode they go through unescape_html() and process_page(), in raw mode they are stored unchanged in document::text_raw. Each resulting document is inserted, paired with its entry, into the files member of the returned book. */ book read(const fs::path filepath, const bool raw) +{ + using helpers::unescape_html; + + /* Immediately invoked lambda: picks the entry list once, depending on raw. */ std::vector epub_filepaths{[&filepath, raw] + { + if (!raw) + { + return zip::list_spine(filepath); + } + return zip::list(filepath); + }()}; + + book current_book; + for (const auto &entry : epub_filepaths) + { + DEBUGLOG << "Processing " << entry; + document doc; + /* Non-raw: unescape the contents, then let process_page() build the document. */ if (!raw) + { + doc = process_page(unescape_html(zip::read_file(filepath, entry))); + } + else + { + /* Raw: keep the contents exactly as read. */ doc.text_raw = zip::read_file(filepath, entry); + } + current_book.files.insert({entry, doc}); + } + + return current_book; +} + +document process_page(const std::string_view text) +{ + string output{text}; + static const boost::regex re_header_start{"<[hH][1-6]"}; + static const boost::regex re_header_end{"]+pagebreak[^>]+" + "(title|aria-label)" + "=\"([[:alnum:]]+)\""}; + + { + size_t pos{0}; + while ((pos = output.find_first_of("\n\t\r", pos)) != string::npos) + { + if (output[pos] == '\r') + { + output.erase(pos, 1); + } + 
else + { + output.replace(pos, 1, " "); + } + } + } + { + size_t pos{0}; + while ((pos = output.find(" ", pos)) != string::npos) + { + output.replace(pos, 2, " "); + } + } + + size_t pos{0}; + document doc; + size_t headline_start{string::npos}; + while ((pos = output.find('<', pos)) != string::npos) + { + auto endpos{output.find('>', pos) + 1}; + + if (boost::regex_match(output.substr(pos, 3), re_header_start)) + { + headline_start = pos; + } + else if (boost::regex_match(output.substr(pos, 4), re_header_end)) + { + if (headline_start != string::npos) + { + doc.headlines.insert( + {headline_start, + output.substr(headline_start, pos - headline_start)}); + headline_start = string::npos; + } + } + else if (output.substr(pos, 6) == " match; + using it_size_t = string::const_iterator::difference_type; + string::const_iterator begin{output.begin() + + static_cast(pos)}; + string::const_iterator end{output.begin() + + static_cast(endpos)}; + + if (boost::regex_search(begin, end, match, re_pagebreak)) + { + doc.pages.insert({pos, match[2].str()}); + } + } + else if (output.substr(pos, 7) == "