From 00e3edb9f20249cc3608ca10b71e39d1ecc76297 Mon Sep 17 00:00:00 2001 From: tastytea Date: Sat, 29 May 2021 15:50:03 +0200 Subject: [PATCH] Only search files in spine, in the right order. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The spine lists all content documents in their linear reading order. So we're finally getting our results in the right order! πŸŽ‰ Since we skip the images and fonts, which usually make up the most bytes in an EPUB file, the performance increase is immense. I measured 60-70% in a very short test. Closes: https://schlomp.space/tastytea/epubgrep/issues/1 --- .drone.yml | 10 +- CMakeLists.txt | 1 + README.adoc | 6 +- man/epubgrep.1.adoc | 5 +- src/CMakeLists.txt | 3 +- src/search.cpp | 11 ++- src/zip.cpp | 99 ++++++++++++++++++- src/zip.hpp | 6 ++ .../{test_search.cpp => test_search_zip.cpp} | 30 +++--- 9 files changed, 147 insertions(+), 24 deletions(-) rename tests/{test_search.cpp => test_search_zip.cpp} (79%) diff --git a/.drone.yml b/.drone.yml index 3493d03..6d84804 100644 --- a/.drone.yml +++ b/.drone.yml @@ -26,7 +26,7 @@ steps: - alias apt-get='rm -f /var/cache/apt/archives/lock && apt-get' - apt-get update -q - apt-get install -qq build-essential cmake clang locales - - apt-get install -qq catch libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc + - apt-get install -qq catch libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc libpugixml-dev - rm -rf build && mkdir -p build && cd build - cmake -G "Unix Makefiles" -DWITH_TESTS=YES .. - make VERBOSE=1 @@ -63,7 +63,7 @@ steps: - alias apt-get='rm -f /var/cache/apt/archives/lock && apt-get' - apt-get update -q - apt-get install -qq g++-8 build-essential clang locales - - apt-get install -qq catch libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc + - apt-get install -qq catch libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc libpugixml-dev - sh cmake_installer.sh --skip-license --exclude-subdir --prefix=/usr/local - rm -rf build && mkdir -p build && cd build - cmake -G "Unix Makefiles" -DWITH_TESTS=YES .. @@ -120,7 +120,7 @@ steps: - alias apt-get='rm -f /var/cache/apt/archives/lock && apt-get' - apt-get update -q - apt-get install -qq build-essential cmake clang locales lsb-release - - apt-get install -qq catch libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc + - apt-get install -qq catch libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc libpugixml-dev - rm -rf build && mkdir -p build && cd build - cmake -G "Unix Makefiles" -DCMAKE_INSTALL_PREFIX=/usr .. - make VERBOSE=1 @@ -144,7 +144,7 @@ steps: - alias apt-get='rm -f /var/cache/apt/archives/lock && apt-get' - apt-get update -q - apt-get install -qq build-essential cmake clang locales lsb-release - - apt-get install -qq catch libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc + - apt-get install -qq catch libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc libpugixml-dev - rm -rf build && mkdir -p build && cd build - cmake -G "Unix Makefiles" -DCMAKE_INSTALL_PREFIX=/usr .. - make VERBOSE=1 @@ -176,7 +176,7 @@ steps: - alias apt-get='rm -f /var/cache/apt/archives/lock && apt-get' - apt-get update -q - apt-get install -qq g++-8 build-essential clang locales lsb-release - - apt-get install -qq catch libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc + - apt-get install -qq catch libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc libpugixml-dev - sh cmake_installer.sh --skip-license --exclude-subdir --prefix=/usr/local - rm -rf build && mkdir -p build && cd build - cmake -G "Unix Makefiles" -DCMAKE_INSTALL_PREFIX=/usr .. diff --git a/CMakeLists.txt b/CMakeLists.txt index 867e09e..79fa19e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -39,6 +39,7 @@ if(NOT termcolor_FOUND) endif() endif() find_package(Threads REQUIRED) +find_package(pugixml 1 REQUIRED CONFIG) add_subdirectory(src) diff --git a/README.adoc b/README.adoc index e5fb7ff..daaceb7 100644 --- a/README.adoc +++ b/README.adoc @@ -15,6 +15,7 @@ :uri-fmt: https://github.com/fmtlib/fmt :uri-asciidoc: http://asciidoc.org/ :uri-termcolor: https://termcolor.readthedocs.io/ +:uri-pugixml: https://pugixml.org/ :license: https://schlomp.space/tastytea/{project}/src/branch/main/LICENSE :license-termcolor: https://schlomp.space/tastytea/{project}/src/branch/main/dist/termcolor/LICENSE @@ -84,6 +85,7 @@ If you get the error message that `add-apt-repository` was not found, install * link:{uri-asciidoc}[AsciiDoc] (tested: 9.0 / 8.6) * link:{uri-termcolor}[Termcolor] (tested: 2.0) (If not found, the bundled version is used.) +* link:{uri-pugixml}[pugixml] (tested: 1.11 / 1.8) * Optional ** Tests: link:{uri-catch}[Catch] (tested: 2.13 / 1.10) @@ -95,7 +97,9 @@ of CMake. [source,shell] -------------------------------------------------------------------------------- -sudo apt install build-essential cmake libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc +sudo apt install build-essential cmake libboost-program-options-dev \ + libboost-locale-dev libboost-regex-dev gettext libarchive-dev \ + libfmt-dev asciidoc libpugixml-dev -------------------------------------------------------------------------------- ==== Get sourcecode diff --git a/man/epubgrep.1.adoc b/man/epubgrep.1.adoc index 6fafdba..c07fb31 100644 --- a/man/epubgrep.1.adoc +++ b/man/epubgrep.1.adoc @@ -2,7 +2,7 @@ :doctype: manpage :Author: tastytea :Email: tastytea@tastytea.de -:Date: 2021-05-28 +:Date: 2021-05-29 :Revision: 0.0.0 :man source: epubgrep :man manual: General Commands Manual @@ -50,7 +50,8 @@ Ignore case distinctions in pattern and data. Use additional _PATTERN_ for matching. Can be used more than once. *-a*, *--raw*:: -Do not clean up text before searching. No HTML stripping, no newline removal. +Do not clean up text before searching. No HTML stripping, no newline removal, +all files will be read (not just the text documents listed in the spine). *-C* _NUMBER_, *context* _NUMBER_:: Print _NUMBER_ words of context around matches. diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index df6a810..1bdff15 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -24,7 +24,8 @@ target_link_libraries(${PROJECT_NAME}_lib fmt::fmt termcolor::termcolor Threads::Threads - m) + m + pugixml) if(${CMAKE_VERSION} VERSION_LESS 3.17) target_link_libraries(${PROJECT_NAME}_lib diff --git a/src/search.cpp b/src/search.cpp index 050bfbc..aa6d078 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -62,7 +62,16 @@ std::vector search(const fs::path &filepath, const boost::regex re(regex.data(), flags); std::vector matches; - for (const auto &entry : zip::list(filepath)) + std::vector epub_filepaths{[&opts, &filepath] + { + if (!opts.raw) + { + return zip::list_spine(filepath); + } + return zip::list(filepath); + }()}; + + for (const auto &entry : epub_filepaths) { auto document{zip::read_file(filepath, entry)}; if (!opts.raw) diff --git a/src/zip.cpp b/src/zip.cpp index 67d9b70..d997cda 100644 --- a/src/zip.cpp +++ b/src/zip.cpp @@ -23,6 +23,7 @@ #include #include #include // For compatibility with fmt 4. +#include #include #include @@ -30,6 +31,7 @@ #include #include #include +#include #include namespace epubgrep::zip @@ -50,12 +52,12 @@ std::vector list(const fs::path &filepath) if (in_epub_filepath == nullptr) { // If the encoding is broken, we skip the file. std::cerr << translate("WARNING: ") - << format(translate("{0:s} is damaged. " - "Skipping rest of file.\n") + << format(translate("File in {0:s} is damaged. " + "Skipping in-EPUB file.\n") .str() .data(), filepath); - break; + continue; } toc.emplace_back(in_epub_filepath); archive_read_data_skip(zipfile); @@ -74,6 +76,16 @@ std::string read_file(const fs::path &filepath, std::string_view entry_path) while (archive_read_next_header(zipfile, &entry) == ARCHIVE_OK) { const auto *path{archive_entry_pathname_utf8(entry)}; + if (path == nullptr) + { // If the encoding is broken, we skip the file. + std::cerr << translate("WARNING: ") + << format(translate("File in {0:s} is damaged. " + "Skipping in-EPUB file.\n") + .str() + .data(), + filepath); + continue; + } if (std::strcmp(path, entry_path.data()) == 0) { const auto length{static_cast(archive_entry_size(entry))}; @@ -140,4 +152,85 @@ void close_file(struct archive *zipfile, const fs::path &filepath) } } +std::vector list_spine(const fs::path &filepath) +{ + const fs::path opf_file_path{ + [&filepath] + { + pugi::xml_document xml; + const std::string container{ + read_file(filepath, "META-INF/container.xml")}; + const auto result{xml.load_buffer(&container[0], container.size())}; + if (result) + { + return xml.child("container") + .child("rootfiles") + .first_child() + .attribute("full-path") + .value(); + } + + return ""; + }()}; + + std::vector spine_filepaths; + if (!opf_file_path.empty()) + { + pugi::xml_document xml; + const std::string opf_file{read_file(filepath, opf_file_path.string())}; + const auto result{xml.load_buffer(&opf_file[0], opf_file.size())}; + if (result) + { + auto manifest{xml.child("package").child("manifest")}; + auto spine{xml.child("package").child("spine")}; + + for (const auto &itemref : spine) + { + const auto &idref{itemref.attribute("idref").value()}; + const auto &item{manifest.find_child_by_attribute("id", idref)}; + const std::string href{ + urldecode(item.attribute("href").value())}; + if (href[0] != '/') + { + spine_filepaths.emplace_back( + opf_file_path.parent_path() /= href); + continue; + } + spine_filepaths.emplace_back(href); + } + } + } + + if (opf_file_path.empty() || spine_filepaths.empty()) + { + std::cerr << translate("ERROR: ") + << format(translate("{0:s} is damaged. Could not read spine. " + "Skipping file.\n") + .str() + .data(), + filepath); + return {}; + } + + return spine_filepaths; +} + +std::string urldecode(const std::string_view url) +{ // RFC 3986, section 2.1. + size_t pos{0}; + size_t lastpos{0}; + std::string decoded; + while ((pos = url.find('%', pos)) != std::string_view::npos) + { + decoded += url.substr(lastpos, pos - lastpos); + decoded += static_cast( + std::stoul(std::string(url.substr(pos + 1, 2)), nullptr, 16)); + pos += 3; + lastpos = pos; + } + decoded += url.substr(lastpos); + + return decoded; +} + } // namespace epubgrep::zip diff --git a/src/zip.hpp b/src/zip.hpp index e8719a5..c0de8f1 100644 --- a/src/zip.hpp +++ b/src/zip.hpp @@ -43,6 +43,12 @@ namespace epubgrep::zip //! Close zip file. void close_file(struct archive *zipfile, const fs::path &filepath); +//! Returns the files in the EPUB β€œspine” (all pages that are actually text). +[[nodiscard]] std::vector list_spine(const fs::path &filepath); + +//! Decode percent-encoding. Used for restricted characters in URLs. +[[nodiscard]] std::string urldecode(std::string_view url); + //! It's std::runtime_error, but with another name. class exception : public std::runtime_error { diff --git a/tests/test_search.cpp b/tests/test_search_zip.cpp similarity index 79% rename from tests/test_search.cpp rename to tests/test_search_zip.cpp index d4859d9..b5da7af 100644 --- a/tests/test_search.cpp +++ b/tests/test_search_zip.cpp @@ -23,6 +23,7 @@ SCENARIO("Searching works") { std::vector matches; epubgrep::search::settings opts; + opts.raw = true; WHEN("We search for β€˜πŸ“™+\\w?’ using extended regular expressions") { @@ -63,17 +64,19 @@ SCENARIO("Searching works") REQUIRE_FALSE(exception); REQUIRE(matches.at(0).filepath == "test folder/😊"); REQUIRE(matches.at(0).text == "πŸ“—"); - REQUIRE(matches.at(0).context.first == "πŸ“– πŸ“˜"); - REQUIRE(matches.at(0).context.second == "πŸ“™ "); + REQUIRE(matches.at(0).context.first == "πŸ“–\n\nπŸ“˜"); + REQUIRE(matches.at(0).context.second == "πŸ“™\n"); } } - WHEN("We search for β€˜ ’ (space) with context = 1.") + WHEN("We search for β€˜[ \\n]’ with context = 1.") { try { opts.context = 1; - matches = epubgrep::search::search(zipfile, " ", opts); + opts.regex = epubgrep::options::regex_kind::perl; + matches = epubgrep::search::search(zipfile, R"([ \n])", + opts); } catch (const std::exception &) { @@ -83,19 +86,24 @@ SCENARIO("Searching works") THEN("No exception is thrown") AND_THEN("It returns the match correctly") { + // I looked at this a week or so after I've written it, and + // I have come to the realization that this is a tiny bit + // more complicated than strictly required. πŸ˜„ + // TODO: Rewrite test.zip and tests to be better + // understandable. REQUIRE_FALSE(exception); REQUIRE(matches.at(1).filepath == "test folder/test file"); REQUIRE(matches.at(1).text == " "); REQUIRE(matches.at(1).context.first == "don't"); REQUIRE(matches.at(1).context.second == "want to"); REQUIRE(matches.at(10).filepath == "test folder/😊"); - REQUIRE(matches.at(10).text == " "); + REQUIRE(matches.at(10).text == "\n"); REQUIRE(matches.at(10).context.first == "πŸ“–"); - REQUIRE(matches.at(10).context.second == "πŸ“˜πŸ“—πŸ“™ "); - REQUIRE(matches.at(11).filepath == "test folder/😊"); - REQUIRE(matches.at(11).text == " "); - REQUIRE(matches.at(11).context.first == "πŸ“˜πŸ“—πŸ“™"); - REQUIRE(matches.at(11).context.second == ""); + REQUIRE(matches.at(10).context.second == "\nπŸ“˜πŸ“—πŸ“™\n"); + REQUIRE(matches.at(12).filepath == "test folder/😊"); + REQUIRE(matches.at(12).text == "\n"); + REQUIRE(matches.at(12).context.first == "πŸ“˜πŸ“—πŸ“™"); + REQUIRE(matches.at(12).context.second.empty()); } } @@ -119,7 +127,7 @@ SCENARIO("Searching works") { REQUIRE_FALSE(exception); REQUIRE(matches.at(0).filepath == "test folder/test file"); - REQUIRE(matches.at(0).text == "work today. I'm stay"); + REQUIRE(matches.at(0).text == "work today.\nI'm stay"); REQUIRE(matches.at(0).context.first == "to "); REQUIRE(matches.at(0).context.second == "ing in"); }