diff --git a/.drone.yml b/.drone.yml index 3493d03..6d84804 100644 --- a/.drone.yml +++ b/.drone.yml @@ -26,7 +26,7 @@ steps: - alias apt-get='rm -f /var/cache/apt/archives/lock && apt-get' - apt-get update -q - apt-get install -qq build-essential cmake clang locales - - apt-get install -qq catch libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc + - apt-get install -qq catch libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc libpugixml-dev - rm -rf build && mkdir -p build && cd build - cmake -G "Unix Makefiles" -DWITH_TESTS=YES .. - make VERBOSE=1 @@ -63,7 +63,7 @@ steps: - alias apt-get='rm -f /var/cache/apt/archives/lock && apt-get' - apt-get update -q - apt-get install -qq g++-8 build-essential clang locales - - apt-get install -qq catch libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc + - apt-get install -qq catch libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc libpugixml-dev - sh cmake_installer.sh --skip-license --exclude-subdir --prefix=/usr/local - rm -rf build && mkdir -p build && cd build - cmake -G "Unix Makefiles" -DWITH_TESTS=YES .. @@ -120,7 +120,7 @@ steps: - alias apt-get='rm -f /var/cache/apt/archives/lock && apt-get' - apt-get update -q - apt-get install -qq build-essential cmake clang locales lsb-release - - apt-get install -qq catch libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc + - apt-get install -qq catch libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc libpugixml-dev - rm -rf build && mkdir -p build && cd build - cmake -G "Unix Makefiles" -DCMAKE_INSTALL_PREFIX=/usr .. - make VERBOSE=1 @@ -144,7 +144,7 @@ steps: - alias apt-get='rm -f /var/cache/apt/archives/lock && apt-get' - apt-get update -q - apt-get install -qq build-essential cmake clang locales lsb-release - - apt-get install -qq catch libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc + - apt-get install -qq catch libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc libpugixml-dev - rm -rf build && mkdir -p build && cd build - cmake -G "Unix Makefiles" -DCMAKE_INSTALL_PREFIX=/usr .. - make VERBOSE=1 @@ -176,7 +176,7 @@ steps: - alias apt-get='rm -f /var/cache/apt/archives/lock && apt-get' - apt-get update -q - apt-get install -qq g++-8 build-essential clang locales lsb-release - - apt-get install -qq catch libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc + - apt-get install -qq catch libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc libpugixml-dev - sh cmake_installer.sh --skip-license --exclude-subdir --prefix=/usr/local - rm -rf build && mkdir -p build && cd build - cmake -G "Unix Makefiles" -DCMAKE_INSTALL_PREFIX=/usr .. diff --git a/CMakeLists.txt b/CMakeLists.txt index 867e09e..79fa19e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -39,6 +39,7 @@ if(NOT termcolor_FOUND) endif() endif() find_package(Threads REQUIRED) +find_package(pugixml 1 REQUIRED CONFIG) add_subdirectory(src) diff --git a/README.adoc b/README.adoc index e5fb7ff..daaceb7 100644 --- a/README.adoc +++ b/README.adoc @@ -15,6 +15,7 @@ :uri-fmt: https://github.com/fmtlib/fmt :uri-asciidoc: http://asciidoc.org/ :uri-termcolor: https://termcolor.readthedocs.io/ +:uri-pugixml: https://pugixml.org/ :license: https://schlomp.space/tastytea/{project}/src/branch/main/LICENSE :license-termcolor: https://schlomp.space/tastytea/{project}/src/branch/main/dist/termcolor/LICENSE @@ -84,6 +85,7 @@ If you get the error message that `add-apt-repository` was not found, install * link:{uri-asciidoc}[AsciiDoc] (tested: 9.0 / 8.6) * link:{uri-termcolor}[Termcolor] (tested: 2.0) (If not found, the bundled version is used.) +* link:{uri-pugixml}[pugixml] (tested: 1.11 / 1.8) * Optional ** Tests: link:{uri-catch}[Catch] (tested: 2.13 / 1.10) @@ -95,7 +97,9 @@ of CMake. [source,shell] -------------------------------------------------------------------------------- -sudo apt install build-essential cmake libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc +sudo apt install build-essential cmake libboost-program-options-dev \ + libboost-locale-dev libboost-regex-dev gettext libarchive-dev \ + libfmt-dev asciidoc libpugixml-dev -------------------------------------------------------------------------------- ==== Get sourcecode diff --git a/man/epubgrep.1.adoc b/man/epubgrep.1.adoc index 6fafdba..c07fb31 100644 --- a/man/epubgrep.1.adoc +++ b/man/epubgrep.1.adoc @@ -2,7 +2,7 @@ :doctype: manpage :Author: tastytea :Email: tastytea@tastytea.de -:Date: 2021-05-28 +:Date: 2021-05-29 :Revision: 0.0.0 :man source: epubgrep :man manual: General Commands Manual @@ -50,7 +50,8 @@ Ignore case distinctions in pattern and data. Use additional _PATTERN_ for matching. Can be used more than once. *-a*, *--raw*:: -Do not clean up text before searching. No HTML stripping, no newline removal. +Do not clean up text before searching. No HTML stripping, no newline removal, +all files will be read (not just the text documents listed in the spine). *-C* _NUMBER_, *context* _NUMBER_:: Print _NUMBER_ words of context around matches. diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index df6a810..1bdff15 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -24,7 +24,8 @@ target_link_libraries(${PROJECT_NAME}_lib fmt::fmt termcolor::termcolor Threads::Threads - m) + m + pugixml) if(${CMAKE_VERSION} VERSION_LESS 3.17) target_link_libraries(${PROJECT_NAME}_lib diff --git a/src/search.cpp b/src/search.cpp index 050bfbc..aa6d078 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -62,7 +62,16 @@ std::vector search(const fs::path &filepath, const boost::regex re(regex.data(), flags); std::vector matches; - for (const auto &entry : zip::list(filepath)) + std::vector epub_filepaths{[&opts, &filepath] + { + if (!opts.raw) + { + return zip::list_spine(filepath); + } + return zip::list(filepath); + }()}; + + for (const auto &entry : epub_filepaths) { auto document{zip::read_file(filepath, entry)}; if (!opts.raw) diff --git a/src/zip.cpp b/src/zip.cpp index 67d9b70..d997cda 100644 --- a/src/zip.cpp +++ b/src/zip.cpp @@ -23,6 +23,7 @@ #include #include #include // For compatibility with fmt 4. +#include #include #include @@ -30,6 +31,7 @@ #include #include #include +#include #include namespace epubgrep::zip @@ -50,12 +52,12 @@ std::vector list(const fs::path &filepath) if (in_epub_filepath == nullptr) { // If the encoding is broken, we skip the file. std::cerr << translate("WARNING: ") - << format(translate("{0:s} is damaged. " - "Skipping rest of file.\n") + << format(translate("File in {0:s} is damaged. " + "Skipping in-EPUB file.\n") .str() .data(), filepath); - break; + continue; } toc.emplace_back(in_epub_filepath); archive_read_data_skip(zipfile); @@ -74,6 +76,16 @@ std::string read_file(const fs::path &filepath, std::string_view entry_path) while (archive_read_next_header(zipfile, &entry) == ARCHIVE_OK) { const auto *path{archive_entry_pathname_utf8(entry)}; + if (path == nullptr) + { // If the encoding is broken, we skip the file. + std::cerr << translate("WARNING: ") + << format(translate("File in {0:s} is damaged. " + "Skipping in-EPUB file.\n") + .str() + .data(), + filepath); + continue; + } if (std::strcmp(path, entry_path.data()) == 0) { const auto length{static_cast(archive_entry_size(entry))}; @@ -140,4 +152,85 @@ void close_file(struct archive *zipfile, const fs::path &filepath) } } +std::vector list_spine(const fs::path &filepath) +{ + const fs::path opf_file_path{ + [&filepath] + { + pugi::xml_document xml; + const std::string container{ + read_file(filepath, "META-INF/container.xml")}; + const auto result{xml.load_buffer(&container[0], container.size())}; + if (result) + { + return xml.child("container") + .child("rootfiles") + .first_child() + .attribute("full-path") + .value(); + } + + return ""; + }()}; + + std::vector spine_filepaths; + if (!opf_file_path.empty()) + { + pugi::xml_document xml; + const std::string opf_file{read_file(filepath, opf_file_path.string())}; + const auto result{xml.load_buffer(&opf_file[0], opf_file.size())}; + if (result) + { + auto manifest{xml.child("package").child("manifest")}; + auto spine{xml.child("package").child("spine")}; + + for (const auto &itemref : spine) + { + const auto &idref{itemref.attribute("idref").value()}; + const auto &item{manifest.find_child_by_attribute("id", idref)}; + const std::string href{ + urldecode(item.attribute("href").value())}; + if (href[0] != '/') + { + spine_filepaths.emplace_back( + opf_file_path.parent_path() /= href); + continue; + } + spine_filepaths.emplace_back(href); + } + } + } + + if (opf_file_path.empty() || spine_filepaths.empty()) + { + std::cerr << translate("ERROR: ") + << format(translate("{0:s} is damaged. Could not read spine. " + "Skipping file.\n") + .str() + .data(), + filepath); + return {}; + } + + return spine_filepaths; +} + +std::string urldecode(const std::string_view url) +{ // RFC 3986, section 2.1. + size_t pos{0}; + size_t lastpos{0}; + std::string decoded; + while ((pos = url.find('%', pos)) != std::string_view::npos) + { + decoded += url.substr(lastpos, pos - lastpos); + decoded += static_cast( + std::stoul(std::string(url.substr(pos + 1, 2)), nullptr, 16)); + pos += 3; + lastpos = pos; + } + decoded += url.substr(lastpos); + + return decoded; +} + } // namespace epubgrep::zip diff --git a/src/zip.hpp b/src/zip.hpp index e8719a5..c0de8f1 100644 --- a/src/zip.hpp +++ b/src/zip.hpp @@ -43,6 +43,12 @@ namespace epubgrep::zip //! Close zip file. void close_file(struct archive *zipfile, const fs::path &filepath); +//! Returns the files in the EPUB β€œspine” (all pages that are actually text). +[[nodiscard]] std::vector list_spine(const fs::path &filepath); + +//! Decode percent-encoding. Used for restricted characters in URLs. +[[nodiscard]] std::string urldecode(std::string_view url); + //! It's std::runtime_error, but with another name. class exception : public std::runtime_error { diff --git a/tests/test_search.cpp b/tests/test_search_zip.cpp similarity index 79% rename from tests/test_search.cpp rename to tests/test_search_zip.cpp index d4859d9..b5da7af 100644 --- a/tests/test_search.cpp +++ b/tests/test_search_zip.cpp @@ -23,6 +23,7 @@ SCENARIO("Searching works") { std::vector matches; epubgrep::search::settings opts; + opts.raw = true; WHEN("We search for β€˜πŸ“™+\\w?’ using extended regular expressions") { @@ -63,17 +64,19 @@ SCENARIO("Searching works") REQUIRE_FALSE(exception); REQUIRE(matches.at(0).filepath == "test folder/😊"); REQUIRE(matches.at(0).text == "πŸ“—"); - REQUIRE(matches.at(0).context.first == "πŸ“– πŸ“˜"); - REQUIRE(matches.at(0).context.second == "πŸ“™ "); + REQUIRE(matches.at(0).context.first == "πŸ“–\n\nπŸ“˜"); + REQUIRE(matches.at(0).context.second == "πŸ“™\n"); } } - WHEN("We search for β€˜ ’ (space) with context = 1.") + WHEN("We search for β€˜[ \\n]’ with context = 1.") { try { opts.context = 1; - matches = epubgrep::search::search(zipfile, " ", opts); + opts.regex = epubgrep::options::regex_kind::perl; + matches = epubgrep::search::search(zipfile, R"([ \n])", + opts); } catch (const std::exception &) { @@ -83,19 +86,24 @@ SCENARIO("Searching works") THEN("No exception is thrown") AND_THEN("It returns the match correctly") { + // I looked at this a week or so after I've written it, and + // I have come to the realization that this is a tiny bit + // more complicated than strictly required. πŸ˜„ + // TODO: Rewrite test.zip and tests to be better + // understandable. REQUIRE_FALSE(exception); REQUIRE(matches.at(1).filepath == "test folder/test file"); REQUIRE(matches.at(1).text == " "); REQUIRE(matches.at(1).context.first == "don't"); REQUIRE(matches.at(1).context.second == "want to"); REQUIRE(matches.at(10).filepath == "test folder/😊"); - REQUIRE(matches.at(10).text == " "); + REQUIRE(matches.at(10).text == "\n"); REQUIRE(matches.at(10).context.first == "πŸ“–"); - REQUIRE(matches.at(10).context.second == "πŸ“˜πŸ“—πŸ“™ "); - REQUIRE(matches.at(11).filepath == "test folder/😊"); - REQUIRE(matches.at(11).text == " "); - REQUIRE(matches.at(11).context.first == "πŸ“˜πŸ“—πŸ“™"); - REQUIRE(matches.at(11).context.second == ""); + REQUIRE(matches.at(10).context.second == "\nπŸ“˜πŸ“—πŸ“™\n"); + REQUIRE(matches.at(12).filepath == "test folder/😊"); + REQUIRE(matches.at(12).text == "\n"); + REQUIRE(matches.at(12).context.first == "πŸ“˜πŸ“—πŸ“™"); + REQUIRE(matches.at(12).context.second.empty()); } } @@ -119,7 +127,7 @@ SCENARIO("Searching works") { REQUIRE_FALSE(exception); REQUIRE(matches.at(0).filepath == "test folder/test file"); - REQUIRE(matches.at(0).text == "work today. I'm stay"); + REQUIRE(matches.at(0).text == "work today.\nI'm stay"); REQUIRE(matches.at(0).context.first == "to "); REQUIRE(matches.at(0).context.second == "ing in"); }