Only search files in spine, in the right order.

The spine lists all content documents in their linear reading order. So we're
finally getting our results in the right order! 🎉

Since we skip the images and fonts, which usually make up the most bytes in an
EPUB file, the performance increase is immense. I measured 60-70% in a very
short test.

Closes: #1
This commit is contained in:
tastytea 2021-05-29 15:50:03 +02:00
parent c94d9de0db
commit 00e3edb9f2
Signed by: tastytea
GPG Key ID: CFC39497F1B26E07
9 changed files with 147 additions and 24 deletions

View File

@ -26,7 +26,7 @@ steps:
- alias apt-get='rm -f /var/cache/apt/archives/lock && apt-get' - alias apt-get='rm -f /var/cache/apt/archives/lock && apt-get'
- apt-get update -q - apt-get update -q
- apt-get install -qq build-essential cmake clang locales - apt-get install -qq build-essential cmake clang locales
- apt-get install -qq catch libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc - apt-get install -qq catch libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc libpugixml-dev
- rm -rf build && mkdir -p build && cd build - rm -rf build && mkdir -p build && cd build
- cmake -G "Unix Makefiles" -DWITH_TESTS=YES .. - cmake -G "Unix Makefiles" -DWITH_TESTS=YES ..
- make VERBOSE=1 - make VERBOSE=1
@ -63,7 +63,7 @@ steps:
- alias apt-get='rm -f /var/cache/apt/archives/lock && apt-get' - alias apt-get='rm -f /var/cache/apt/archives/lock && apt-get'
- apt-get update -q - apt-get update -q
- apt-get install -qq g++-8 build-essential clang locales - apt-get install -qq g++-8 build-essential clang locales
- apt-get install -qq catch libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc - apt-get install -qq catch libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc libpugixml-dev
- sh cmake_installer.sh --skip-license --exclude-subdir --prefix=/usr/local - sh cmake_installer.sh --skip-license --exclude-subdir --prefix=/usr/local
- rm -rf build && mkdir -p build && cd build - rm -rf build && mkdir -p build && cd build
- cmake -G "Unix Makefiles" -DWITH_TESTS=YES .. - cmake -G "Unix Makefiles" -DWITH_TESTS=YES ..
@ -120,7 +120,7 @@ steps:
- alias apt-get='rm -f /var/cache/apt/archives/lock && apt-get' - alias apt-get='rm -f /var/cache/apt/archives/lock && apt-get'
- apt-get update -q - apt-get update -q
- apt-get install -qq build-essential cmake clang locales lsb-release - apt-get install -qq build-essential cmake clang locales lsb-release
- apt-get install -qq catch libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc - apt-get install -qq catch libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc libpugixml-dev
- rm -rf build && mkdir -p build && cd build - rm -rf build && mkdir -p build && cd build
- cmake -G "Unix Makefiles" -DCMAKE_INSTALL_PREFIX=/usr .. - cmake -G "Unix Makefiles" -DCMAKE_INSTALL_PREFIX=/usr ..
- make VERBOSE=1 - make VERBOSE=1
@ -144,7 +144,7 @@ steps:
- alias apt-get='rm -f /var/cache/apt/archives/lock && apt-get' - alias apt-get='rm -f /var/cache/apt/archives/lock && apt-get'
- apt-get update -q - apt-get update -q
- apt-get install -qq build-essential cmake clang locales lsb-release - apt-get install -qq build-essential cmake clang locales lsb-release
- apt-get install -qq catch libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc - apt-get install -qq catch libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc libpugixml-dev
- rm -rf build && mkdir -p build && cd build - rm -rf build && mkdir -p build && cd build
- cmake -G "Unix Makefiles" -DCMAKE_INSTALL_PREFIX=/usr .. - cmake -G "Unix Makefiles" -DCMAKE_INSTALL_PREFIX=/usr ..
- make VERBOSE=1 - make VERBOSE=1
@ -176,7 +176,7 @@ steps:
- alias apt-get='rm -f /var/cache/apt/archives/lock && apt-get' - alias apt-get='rm -f /var/cache/apt/archives/lock && apt-get'
- apt-get update -q - apt-get update -q
- apt-get install -qq g++-8 build-essential clang locales lsb-release - apt-get install -qq g++-8 build-essential clang locales lsb-release
- apt-get install -qq catch libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc - apt-get install -qq catch libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc libpugixml-dev
- sh cmake_installer.sh --skip-license --exclude-subdir --prefix=/usr/local - sh cmake_installer.sh --skip-license --exclude-subdir --prefix=/usr/local
- rm -rf build && mkdir -p build && cd build - rm -rf build && mkdir -p build && cd build
- cmake -G "Unix Makefiles" -DCMAKE_INSTALL_PREFIX=/usr .. - cmake -G "Unix Makefiles" -DCMAKE_INSTALL_PREFIX=/usr ..

View File

@ -39,6 +39,7 @@ if(NOT termcolor_FOUND)
endif() endif()
endif() endif()
find_package(Threads REQUIRED) find_package(Threads REQUIRED)
find_package(pugixml 1 REQUIRED CONFIG)
add_subdirectory(src) add_subdirectory(src)

View File

@ -15,6 +15,7 @@
:uri-fmt: https://github.com/fmtlib/fmt :uri-fmt: https://github.com/fmtlib/fmt
:uri-asciidoc: http://asciidoc.org/ :uri-asciidoc: http://asciidoc.org/
:uri-termcolor: https://termcolor.readthedocs.io/ :uri-termcolor: https://termcolor.readthedocs.io/
:uri-pugixml: https://pugixml.org/
:license: https://schlomp.space/tastytea/{project}/src/branch/main/LICENSE :license: https://schlomp.space/tastytea/{project}/src/branch/main/LICENSE
:license-termcolor: https://schlomp.space/tastytea/{project}/src/branch/main/dist/termcolor/LICENSE :license-termcolor: https://schlomp.space/tastytea/{project}/src/branch/main/dist/termcolor/LICENSE
@ -84,6 +85,7 @@ If you get the error message that `add-apt-repository` was not found, install
* link:{uri-asciidoc}[AsciiDoc] (tested: 9.0 / 8.6) * link:{uri-asciidoc}[AsciiDoc] (tested: 9.0 / 8.6)
* link:{uri-termcolor}[Termcolor] (tested: 2.0) (If not found, the bundled * link:{uri-termcolor}[Termcolor] (tested: 2.0) (If not found, the bundled
version is used.) version is used.)
* link:{uri-pugixml}[pugixml] (tested: 1.11 / 1.8)
* Optional * Optional
** Tests: link:{uri-catch}[Catch] (tested: 2.13 / 1.10) ** Tests: link:{uri-catch}[Catch] (tested: 2.13 / 1.10)
@ -95,7 +97,9 @@ of CMake.
[source,shell] [source,shell]
-------------------------------------------------------------------------------- --------------------------------------------------------------------------------
sudo apt install build-essential cmake libboost-program-options-dev libboost-locale-dev libboost-regex-dev gettext libarchive-dev libfmt-dev asciidoc sudo apt install build-essential cmake libboost-program-options-dev \
libboost-locale-dev libboost-regex-dev gettext libarchive-dev \
libfmt-dev asciidoc libpugixml-dev
-------------------------------------------------------------------------------- --------------------------------------------------------------------------------
==== Get sourcecode ==== Get sourcecode

View File

@ -2,7 +2,7 @@
:doctype: manpage :doctype: manpage
:Author: tastytea :Author: tastytea
:Email: tastytea@tastytea.de :Email: tastytea@tastytea.de
:Date: 2021-05-28 :Date: 2021-05-29
:Revision: 0.0.0 :Revision: 0.0.0
:man source: epubgrep :man source: epubgrep
:man manual: General Commands Manual :man manual: General Commands Manual
@ -50,7 +50,8 @@ Ignore case distinctions in pattern and data.
Use additional _PATTERN_ for matching. Can be used more than once. Use additional _PATTERN_ for matching. Can be used more than once.
*-a*, *--raw*:: *-a*, *--raw*::
Do not clean up text before searching. No HTML stripping, no newline removal. Do not clean up text before searching. No HTML stripping, no newline removal,
all files will be read (not just the text documents listed in the spine).
*-C* _NUMBER_, *context* _NUMBER_:: *-C* _NUMBER_, *context* _NUMBER_::
Print _NUMBER_ words of context around matches. Print _NUMBER_ words of context around matches.

View File

@ -24,7 +24,8 @@ target_link_libraries(${PROJECT_NAME}_lib
fmt::fmt fmt::fmt
termcolor::termcolor termcolor::termcolor
Threads::Threads Threads::Threads
m) m
pugixml)
if(${CMAKE_VERSION} VERSION_LESS 3.17) if(${CMAKE_VERSION} VERSION_LESS 3.17)
target_link_libraries(${PROJECT_NAME}_lib target_link_libraries(${PROJECT_NAME}_lib

View File

@ -62,7 +62,16 @@ std::vector<match> search(const fs::path &filepath,
const boost::regex re(regex.data(), flags); const boost::regex re(regex.data(), flags);
std::vector<match> matches; std::vector<match> matches;
for (const auto &entry : zip::list(filepath)) std::vector<string> epub_filepaths{[&opts, &filepath]
{
if (!opts.raw)
{
return zip::list_spine(filepath);
}
return zip::list(filepath);
}()};
for (const auto &entry : epub_filepaths)
{ {
auto document{zip::read_file(filepath, entry)}; auto document{zip::read_file(filepath, entry)};
if (!opts.raw) if (!opts.raw)

View File

@ -23,6 +23,7 @@
#include <boost/locale/message.hpp> #include <boost/locale/message.hpp>
#include <fmt/format.h> #include <fmt/format.h>
#include <fmt/ostream.h> // For compatibility with fmt 4. #include <fmt/ostream.h> // For compatibility with fmt 4.
#include <pugixml.hpp>
#include <cstdlib> #include <cstdlib>
#include <cstring> #include <cstring>
@ -30,6 +31,7 @@
#include <iostream> #include <iostream>
#include <stdexcept> #include <stdexcept>
#include <string> #include <string>
#include <string_view>
#include <vector> #include <vector>
namespace epubgrep::zip namespace epubgrep::zip
@ -50,12 +52,12 @@ std::vector<std::string> list(const fs::path &filepath)
if (in_epub_filepath == nullptr) if (in_epub_filepath == nullptr)
{ // If the encoding is broken, we skip the file. { // If the encoding is broken, we skip the file.
std::cerr << translate("WARNING: ") std::cerr << translate("WARNING: ")
<< format(translate("{0:s} is damaged. " << format(translate("File in {0:s} is damaged. "
"Skipping rest of file.\n") "Skipping in-EPUB file.\n")
.str() .str()
.data(), .data(),
filepath); filepath);
break; continue;
} }
toc.emplace_back(in_epub_filepath); toc.emplace_back(in_epub_filepath);
archive_read_data_skip(zipfile); archive_read_data_skip(zipfile);
@ -74,6 +76,16 @@ std::string read_file(const fs::path &filepath, std::string_view entry_path)
while (archive_read_next_header(zipfile, &entry) == ARCHIVE_OK) while (archive_read_next_header(zipfile, &entry) == ARCHIVE_OK)
{ {
const auto *path{archive_entry_pathname_utf8(entry)}; const auto *path{archive_entry_pathname_utf8(entry)};
if (path == nullptr)
{ // If the encoding is broken, we skip the file.
std::cerr << translate("WARNING: ")
<< format(translate("File in {0:s} is damaged. "
"Skipping in-EPUB file.\n")
.str()
.data(),
filepath);
continue;
}
if (std::strcmp(path, entry_path.data()) == 0) if (std::strcmp(path, entry_path.data()) == 0)
{ {
const auto length{static_cast<size_t>(archive_entry_size(entry))}; const auto length{static_cast<size_t>(archive_entry_size(entry))};
@ -140,4 +152,85 @@ void close_file(struct archive *zipfile, const fs::path &filepath)
} }
} }
//! Returns the in-EPUB paths of all documents referenced by the spine of the
//! OPF package document, in the order the spine lists them.
//!
//! The package document is looked up through `META-INF/container.xml`. Hrefs
//! that do not start with '/' are resolved against the directory of the package
//! document. If the package document can not be located or no usable spine
//! entry is found, an error is printed to stderr and an empty vector is
//! returned.
std::vector<std::string> list_spine(const fs::path &filepath)
{
    const fs::path opf_file_path{
        [&filepath]() -> std::string
        {
            pugi::xml_document xml;
            const std::string container{
                read_file(filepath, "META-INF/container.xml")};
            const auto result{
                xml.load_buffer(container.data(), container.size())};
            if (result)
            {
                // The pointer returned by value() refers to memory owned by
                // `xml`, which is destroyed when this lambda returns. The
                // explicit std::string return type copies the text while the
                // document is still alive.
                return xml.child("container")
                    .child("rootfiles")
                    .first_child()
                    .attribute("full-path")
                    .value();
            }
            return {};
        }()};

    std::vector<std::string> spine_filepaths;

    if (!opf_file_path.empty())
    {
        pugi::xml_document xml;
        const std::string opf_file{read_file(filepath, opf_file_path.string())};
        const auto result{xml.load_buffer(opf_file.data(), opf_file.size())};

        if (result)
        {
            const auto package{xml.child("package")};
            const auto manifest{package.child("manifest")};
            const auto spine{package.child("spine")};

            for (const auto &itemref : spine)
            {
                // The spine only holds ids; the manifest maps them to hrefs.
                const auto *idref{itemref.attribute("idref").value()};
                const auto item{manifest.find_child_by_attribute("id", idref)};
                const std::string href{
                    urldecode(item.attribute("href").value())};

                if (href.empty())
                { // No manifest item (or no href) for this idref. Adding the
                  // bare OPF directory would not name a document, so skip it.
                    continue;
                }

                if (href[0] != '/')
                {
                    spine_filepaths.emplace_back(
                        (opf_file_path.parent_path() / href).string());
                    continue;
                }
                spine_filepaths.emplace_back(href);
            }
        }
    }

    if (opf_file_path.empty() || spine_filepaths.empty())
    {
        std::cerr << translate("ERROR: ")
                  << format(translate("{0:s} is damaged. Could not read spine. "
                                      "Skipping file.\n")
                                .str()
                                .data(),
                            filepath);
        return {};
    }

    return spine_filepaths;
}
//! Decode percent-encoding as described in RFC 3986, section 2.1.
//!
//! Every `%XX` sequence, where `XX` are exactly two hexadecimal digits (upper
//! or lower case), is replaced by the byte with that value. A '%' that is not
//! followed by two hexadecimal digits (malformed or truncated input) is copied
//! to the output unchanged instead of raising an exception.
std::string urldecode(const std::string_view url)
{
    // Numeric value of a hexadecimal digit, or -1 if `c` is not one.
    const auto hex_value{[](const char c) -> int
                         {
                             if (c >= '0' && c <= '9')
                             {
                                 return c - '0';
                             }
                             if (c >= 'a' && c <= 'f')
                             {
                                 return c - 'a' + 10;
                             }
                             if (c >= 'A' && c <= 'F')
                             {
                                 return c - 'A' + 10;
                             }
                             return -1;
                         }};

    std::string decoded;
    decoded.reserve(url.size()); // The result is never longer than the input.

    size_t pos{0};     // Position of the '%' currently examined.
    size_t lastpos{0}; // Start of the text not yet copied to `decoded`.
    while ((pos = url.find('%', pos)) != std::string_view::npos)
    {
        const int high{pos + 1 < url.size() ? hex_value(url[pos + 1]) : -1};
        const int low{pos + 2 < url.size() ? hex_value(url[pos + 2]) : -1};
        if (high < 0 || low < 0)
        { // Not a valid escape. Leave the '%' in place; it is copied together
          // with the surrounding literal text later.
            ++pos;
            continue;
        }

        decoded += url.substr(lastpos, pos - lastpos);
        decoded += static_cast<char>(high * 16 + low);
        pos += 3;
        lastpos = pos;
    }
    decoded += url.substr(lastpos);

    return decoded;
}
} // namespace epubgrep::zip } // namespace epubgrep::zip

View File

@ -43,6 +43,12 @@ namespace epubgrep::zip
//! Close zip file. //! Close zip file.
void close_file(struct archive *zipfile, const fs::path &filepath); void close_file(struct archive *zipfile, const fs::path &filepath);
//! Returns the files in the EPUB “spine” (all pages that are actually text).
[[nodiscard]] std::vector<std::string> list_spine(const fs::path &filepath);
//! Decode percent-encoding. Used for restricted characters in URLs.
[[nodiscard]] std::string urldecode(std::string_view url);
//! It's std::runtime_error, but with another name. //! It's std::runtime_error, but with another name.
class exception : public std::runtime_error class exception : public std::runtime_error
{ {

View File

@ -23,6 +23,7 @@ SCENARIO("Searching works")
{ {
std::vector<epubgrep::search::match> matches; std::vector<epubgrep::search::match> matches;
epubgrep::search::settings opts; epubgrep::search::settings opts;
opts.raw = true;
WHEN("We search for ‘📙+\\w? using extended regular expressions") WHEN("We search for ‘📙+\\w? using extended regular expressions")
{ {
@ -63,17 +64,19 @@ SCENARIO("Searching works")
REQUIRE_FALSE(exception); REQUIRE_FALSE(exception);
REQUIRE(matches.at(0).filepath == "test folder/😊"); REQUIRE(matches.at(0).filepath == "test folder/😊");
REQUIRE(matches.at(0).text == "📗"); REQUIRE(matches.at(0).text == "📗");
REQUIRE(matches.at(0).context.first == "📖 📘"); REQUIRE(matches.at(0).context.first == "📖\n\n📘");
REQUIRE(matches.at(0).context.second == "📙 "); REQUIRE(matches.at(0).context.second == "📙\n");
} }
} }
WHEN("We search for (space) with context = 1.") WHEN("We search for [ \\n] with context = 1.")
{ {
try try
{ {
opts.context = 1; opts.context = 1;
matches = epubgrep::search::search(zipfile, " ", opts); opts.regex = epubgrep::options::regex_kind::perl;
matches = epubgrep::search::search(zipfile, R"([ \n])",
opts);
} }
catch (const std::exception &) catch (const std::exception &)
{ {
@ -83,19 +86,24 @@ SCENARIO("Searching works")
THEN("No exception is thrown") THEN("No exception is thrown")
AND_THEN("It returns the match correctly") AND_THEN("It returns the match correctly")
{ {
// I looked at this a week or so after I've written it, and
// I have come to the realization that this is a tiny bit
// more complicated than strictly required. 😄
// TODO: Rewrite test.zip and tests to be better
// understandable.
REQUIRE_FALSE(exception); REQUIRE_FALSE(exception);
REQUIRE(matches.at(1).filepath == "test folder/test file"); REQUIRE(matches.at(1).filepath == "test folder/test file");
REQUIRE(matches.at(1).text == " "); REQUIRE(matches.at(1).text == " ");
REQUIRE(matches.at(1).context.first == "don't"); REQUIRE(matches.at(1).context.first == "don't");
REQUIRE(matches.at(1).context.second == "want to"); REQUIRE(matches.at(1).context.second == "want to");
REQUIRE(matches.at(10).filepath == "test folder/😊"); REQUIRE(matches.at(10).filepath == "test folder/😊");
REQUIRE(matches.at(10).text == " "); REQUIRE(matches.at(10).text == "\n");
REQUIRE(matches.at(10).context.first == "📖"); REQUIRE(matches.at(10).context.first == "📖");
REQUIRE(matches.at(10).context.second == "📘📗📙 "); REQUIRE(matches.at(10).context.second == "\n📘📗📙\n");
REQUIRE(matches.at(11).filepath == "test folder/😊"); REQUIRE(matches.at(12).filepath == "test folder/😊");
REQUIRE(matches.at(11).text == " "); REQUIRE(matches.at(12).text == "\n");
REQUIRE(matches.at(11).context.first == "📘📗📙"); REQUIRE(matches.at(12).context.first == "📘📗📙");
REQUIRE(matches.at(11).context.second == ""); REQUIRE(matches.at(12).context.second.empty());
} }
} }
@ -119,7 +127,7 @@ SCENARIO("Searching works")
{ {
REQUIRE_FALSE(exception); REQUIRE_FALSE(exception);
REQUIRE(matches.at(0).filepath == "test folder/test file"); REQUIRE(matches.at(0).filepath == "test folder/test file");
REQUIRE(matches.at(0).text == "work today. I'm stay"); REQUIRE(matches.at(0).text == "work today.\nI'm stay");
REQUIRE(matches.at(0).context.first == "to "); REQUIRE(matches.at(0).context.first == "to ");
REQUIRE(matches.at(0).context.second == "ing in"); REQUIRE(matches.at(0).context.second == "ing in");
} }