Move book processing into own file.

2021-08-17 13:05:14 +02:00 · 2021-08-17 13:05:14 +02:00 · 84ef5d1bf3
commit 84ef5d1bf3
parent 97fecd37f0
5 changed files with 269 additions and 166 deletions
--- a/src/book.cpp
+++ b/src/book.cpp
@ -0,0 +1,182 @@
+/*  This file is part of epubgrep.
+ *  Copyright © 2021 tastytea <tastytea@tastytea.de>
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU Affero General Public License as published by
+ *  the Free Software Foundation, version 3.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU Affero General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Affero General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "book.hpp"
+
+#include "fs-compat.hpp"
+#include "helpers.hpp"
+#include "log.hpp"
+#include "zip.hpp"
+
+#include <boost/regex.hpp>
+
+#include <string>
+#include <string_view>
+#include <utility>
+#include <vector>
+
+namespace epubgrep::book
+{
+
+using std::string;
+
+//! Read the files of an EPUB and collect them in a book.
+//!
+//! With raw set, every entry returned by zip::list() is stored unprocessed in
+//! document::text_raw. Otherwise only the entries returned by
+//! zip::list_spine() are read, run through helpers::unescape_html() and
+//! process_page().
+//!
+//! NOTE(review): in raw mode text_cleaned stays empty, so a caller that only
+//! looks at text_cleaned sees no text at all. Confirm that the search code
+//! picks text_raw when raw is requested.
+//!
+//! @param filepath Path of the EPUB file.
+//! @param raw      Keep the files as they are instead of cleaning them up.
+//! @return Book whose files map is keyed by the path inside the EPUB. An
+//!         entry that is listed more than once is stored only once.
+book read(const fs::path filepath, const bool raw)
+{
+    using helpers::unescape_html;
+
+    const std::vector<string> epub_filepaths{
+        raw ? zip::list(filepath) : zip::list_spine(filepath)};
+
+    book current_book;
+    for (const auto &entry : epub_filepaths)
+    {
+        DEBUGLOG << "Processing " << entry;
+        document doc;
+        if (raw)
+        {
+            doc.text_raw = zip::read_file(filepath, entry);
+        }
+        else
+        {
+            doc = process_page(unescape_html(zip::read_file(filepath, entry)));
+        }
+        // doc is not used afterwards: hand its strings and maps over to the
+        // map instead of copying a whole page of text.
+        current_book.files.insert({entry, std::move(doc)});
+    }
+
+    return current_book;
+}
+
+//! Strip markup from a page and record where headlines and pages start.
+//!
+//! First, newlines and tabs become spaces, carriage returns are dropped and
+//! runs of spaces collapse to a single space. After that every <...> tag is
+//! removed. <style ...> and <script ...> elements are removed together with
+//! their content.
+//!
+//! NOTE(review): <style> and <script> are only recognised when the tag name
+//! is followed by a space, i.e. when the tag has attributes. A bare <style>
+//! is treated like any other tag and its content stays in the text.
+//!
+//! @param text Markup of one page.
+//! @return Document with text_cleaned set to the stripped text. headlines
+//!         maps the offset of each <h1>..<h6> element to its text, pages
+//!         maps the offset of each pagebreak <span> to the value of its
+//!         title / aria-label attribute. Offsets refer to text_cleaned.
+document process_page(const std::string_view text)
+{
+    string output{text};
+    static const boost::regex re_header_start{"<[hH][1-6]"};
+    static const boost::regex re_header_end{"</[hH][1-6]"};
+    static const boost::regex re_pagebreak{"[^>]+pagebreak[^>]+"
+                                           "(title|aria-label)"
+                                           "=\"([[:alnum:]]+)\""};
+
+    // Newlines and tabs turn into spaces, carriage returns vanish.
+    {
+        size_t pos{0};
+        while ((pos = output.find_first_of("\n\t\r", pos)) != string::npos)
+        {
+            if (output[pos] == '\r')
+            {
+                output.erase(pos, 1);
+            }
+            else
+            {
+                output.replace(pos, 1, " ");
+            }
+        }
+    }
+    // Collapse runs of spaces. pos is deliberately not advanced after a
+    // replacement so that a longer run keeps shrinking.
+    {
+        size_t pos{0};
+        while ((pos = output.find("  ", pos)) != string::npos)
+        {
+            output.replace(pos, 2, " ");
+        }
+    }
+
+    size_t pos{0};
+    document doc;
+    size_t headline_start{string::npos};
+    while ((pos = output.find('<', pos)) != string::npos)
+    {
+        // One past the '>' that ends this tag. A tag that is never closed
+        // extends to the end of the text. Adding 1 to npos would wrap around
+        // to 0 and give the <span> branch an iterator range that ends before
+        // it begins.
+        const auto tag_close{output.find('>', pos)};
+        auto endpos{tag_close == string::npos ? output.size()
+                                              : tag_close + 1};
+
+        const bool is_style{output.compare(pos, 7, "<style ") == 0};
+        const bool is_script{output.compare(pos, 8, "<script ") == 0};
+
+        if (boost::regex_match(output.substr(pos, 3), re_header_start))
+        {
+            // The tag is erased below, so pos becomes the offset at which
+            // the headline text starts.
+            headline_start = pos;
+        }
+        else if (boost::regex_match(output.substr(pos, 4), re_header_end))
+        {
+            if (headline_start != string::npos)
+            {
+                // Tags inside the headline are already gone at this point.
+                doc.headlines.insert(
+                    {headline_start,
+                     output.substr(headline_start, pos - headline_start)});
+                headline_start = string::npos;
+            }
+        }
+        else if (output.compare(pos, 6, "<span ") == 0)
+        {
+            boost::match_results<string::const_iterator> match;
+            using it_size_t = string::const_iterator::difference_type;
+            const auto begin{output.cbegin() + static_cast<it_size_t>(pos)};
+            const auto end{output.cbegin() + static_cast<it_size_t>(endpos)};
+
+            // Only look inside this one tag.
+            if (boost::regex_search(begin, end, match, re_pagebreak))
+            {
+                doc.pages.insert({pos, match[2].str()});
+            }
+        }
+        else if (is_style || is_script)
+        {
+            // Not self-closing: drop the content and the closing tag, too.
+            if (output.find("/>", pos) > endpos)
+            {
+                // Look for the matching closing tag so that a '>' inside the
+                // CSS or script does not end the element early. Without a
+                // closing tag, fall back to the next '>'.
+                const char *closing_tag{is_style ? "</style" : "</script"};
+                const auto closing_pos{output.find(closing_tag, endpos)};
+                const auto search_from{
+                    closing_pos == string::npos ? endpos : closing_pos};
+                const auto closing_end{output.find('>', search_from)};
+                endpos = (closing_end == string::npos) ? output.size()
+                                                       : closing_end + 1;
+            }
+        }
+
+        output.erase(pos, endpos - pos);
+    }
+
+    doc.text_cleaned = std::move(output);
+
+    return doc;
+}
+
+//! Return the headline in effect at pos.
+//!
+//! @param doc Document whose headlines map is searched.
+//! @param pos Offset to look up, in the same coordinates as the map keys.
+//! @return Text of the last headline whose position is <= pos, or an empty
+//!         string if there is none.
+std::string headline(const document &doc, const size_t pos)
+{
+    // The map is ordered by position: the entry right before the first one
+    // that lies behind pos is the headline we want.
+    auto it{doc.headlines.upper_bound(pos)};
+    if (it == doc.headlines.begin())
+    {
+        return {};
+    }
+    --it;
+
+    return it->second;
+}
+
+//! Return the page that pos lies on.
+//!
+//! @param doc Document whose pages map is searched.
+//! @param pos Offset to look up, in the same coordinates as the map keys.
+//! @return Label of the last page break whose position is <= pos, or an
+//!         empty string if there is none.
+std::string page(const document &doc, const size_t pos)
+{
+    // The map is ordered by position: the entry right before the first one
+    // that lies behind pos is the page we want.
+    auto it{doc.pages.upper_bound(pos)};
+    if (it == doc.pages.begin())
+    {
+        return {};
+    }
+    --it;
+
+    return it->second;
+}
+
+} // namespace epubgrep::book
--- a/src/book.hpp
+++ b/src/book.hpp
@ -0,0 +1,63 @@
+/*  This file is part of epubgrep.
+ *  Copyright © 2021 tastytea <tastytea@tastytea.de>
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU Affero General Public License as published by
+ *  the Free Software Foundation, version 3.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU Affero General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Affero General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef EPUBGREP_BOOK_HPP
+#define EPUBGREP_BOOK_HPP
+
+#include "fs-compat.hpp"
+
+#include <map>
+#include <string>
+#include <string_view>
+
+namespace epubgrep::book
+{
+
+using std::string;
+
+//! One file inside an EPUB, as filled in by read() / process_page().
+struct document
+{
+    string text_raw;                    //!< Unprocessed contents (raw mode)
+    string text_cleaned;                //!< Text after process_page()
+    std::map<size_t, string> headlines; //!< offset in text_cleaned, headline
+    std::map<size_t, string> pages;     //!< offset in text_cleaned, page
+    string language;                    //!< Page language
+} __attribute__((aligned(128)));
+
+//! A whole EPUB file, as returned by read().
+struct book
+{
+    std::map<string, document> files; //!< path inside EPUB, document
+    std::map<string, string> toc;     //!< title, href
+    string language;                  //!< Book language
+} __attribute__((aligned(128)));
+
+//! Read the files of the EPUB at filepath; clean them up unless raw is set.
+[[nodiscard]] book read(fs::path filepath, bool raw);
+
+//! Strip tags, condense whitespace, record headline and page positions.
+[[nodiscard]] document process_page(std::string_view text);
+
+//! Return the last headline positioned at or before pos, "" if none.
+[[nodiscard]] std::string headline(const document &doc, size_t pos);
+
+//! Return the last page recorded at or before pos, "" if none.
+[[nodiscard]] std::string page(const document &doc, size_t pos);
+
+} // namespace epubgrep::book
+
+#endif // EPUBGREP_BOOK_HPP
--- a/src/search.cpp
+++ b/src/search.cpp
@ -16,6 +16,7 @@

 #include "search.hpp"

+#include "book.hpp"
 #include "fs-compat.hpp"
 #include "helpers.hpp"
 #include "log.hpp"
@ -73,33 +74,13 @@ std::vector<match> search(const fs::path &filepath,

    const boost::regex re(regex.data(), flags);
    std::vector<match> matches;
-    std::vector<string> epub_filepaths{[&opts, &filepath]
-                                       {
-                                           if (!opts.raw)
-                                           {
-                                               return zip::list_spine(filepath);
-                                           }
-                                           return zip::list(filepath);
-                                       }()};
-
-    for (const auto &entry : epub_filepaths)
+    auto book{book::read(filepath, opts.raw)};
+    for (const auto &file : book.files)
    {
-        DEBUGLOG << "Processing " << entry;
-        file_in_epub file;
-        {
-            const auto document{zip::read_file(filepath, entry)};
-            if (!opts.raw)
-            {
-                file = cleanup_text(helpers::unescape_html(document));
-            }
-            else
-            {
-                file.text = document;
-            }
-        }
-
-        string::const_iterator begin{file.text.begin()};
-        string::const_iterator end{file.text.end()};
+        const auto &doc{file.second};
+        const auto &text{doc.text_cleaned};
+        string::const_iterator begin{text.begin()};
+        string::const_iterator end{text.end()};
        auto begin_text{begin};
        boost::match_results<string::const_iterator> match_result;

@ -108,13 +89,13 @@ std::vector<match> search(const fs::path &filepath,
        {
            match match; // FIXME: Rename variable or struct.
            match.filepath_epub = filepath;
-            match.filepath_inside = entry;
+            match.filepath_inside = file.first;
            match.text = match_result[0];
            match.context = context(match_result, opts.context);
            const auto pos = static_cast<size_t>(
                std::distance(begin_text, match_result[0].begin()));
-            match.headline = headline(file, pos);
-            match.page = page(file, pos);
+            match.headline = headline(doc, pos);
+            match.page = page(doc, pos);

            matches.emplace_back(match);
            begin = match_result[0].end();
@ -124,89 +105,6 @@ std::vector<match> search(const fs::path &filepath,
    return matches;
 }

-file_in_epub cleanup_text(const std::string_view text)
-{
-    string output{text};
-    static const boost::regex re_header_start{"<[hH][1-6]"};
-    static const boost::regex re_header_end{"</[hH][1-6]"};
-    static const boost::regex re_pagebreak{"[^>]+pagebreak[^>]+"
-                                           "(title|aria-label)"
-                                           "=\"([[:alnum:]]+)\""};
-
-    {
-        size_t pos{0};
-        while ((pos = output.find_first_of("\n\t\r", pos)) != string::npos)
-        {
-            if (output[pos] == '\r')
-            {
-                output.erase(pos, 1);
-            }
-            else
-            {
-                output.replace(pos, 1, " ");
-            }
-        }
-    }
-    {
-        size_t pos{0};
-        while ((pos = output.find("  ", pos)) != string::npos)
-        {
-            output.replace(pos, 2, " ");
-        }
-    }
-
-    size_t pos{0};
-    file_in_epub file;
-    size_t headline_start{string::npos};
-    while ((pos = output.find('<', pos)) != string::npos)
-    {
-        auto endpos{output.find('>', pos) + 1};
-
-        if (boost::regex_match(output.substr(pos, 3), re_header_start))
-        {
-            headline_start = pos;
-        }
-        else if (boost::regex_match(output.substr(pos, 4), re_header_end))
-        {
-            if (headline_start != string::npos)
-            {
-                file.headlines.insert(
-                    {headline_start,
-                     output.substr(headline_start, pos - headline_start)});
-                headline_start = string::npos;
-            }
-        }
-        else if (output.substr(pos, 6) == "<span ")
-        {
-            boost::match_results<string::const_iterator> match;
-            using it_size_t = string::const_iterator::difference_type;
-            string::const_iterator begin{output.begin()
-                                         + static_cast<it_size_t>(pos)};
-            string::const_iterator end{output.begin()
-                                       + static_cast<it_size_t>(endpos)};
-
-            if (boost::regex_search(begin, end, match, re_pagebreak))
-            {
-                file.pages.insert({pos, match[2].str()});
-            }
-        }
-        else if (output.substr(pos, 7) == "<style "
-                 || output.substr(pos, 8) == "<script ")
-        {
-            if (output.find("/>", pos) > endpos)
-            {
-                endpos = output.find('>', endpos) + 1;
-            }
-        }
-
-        output.erase(pos, endpos - pos);
-    }
-
-    file.text = output;
-
-    return file;
-}
-
 match_context context(const boost::match_results<string::const_iterator> &match,
                      std::uint64_t words)
 {
@ -270,36 +168,4 @@ match_context context(const boost::match_results<string::const_iterator> &match,
    return {before, after};
 }

-std::string headline(const file_in_epub &file, const size_t pos)
-{
-    std::string_view last;
-
-    for (const auto &pair : file.headlines)
-    {
-        if (pair.first > pos)
-        {
-            break;
-        }
-        last = pair.second;
-    }
-
-    return string(last);
-}
-
-string page(const file_in_epub &file, const size_t pos)
-{
-    std::string_view last;
-
-    for (const auto &pair : file.pages)
-    {
-        if (pair.first > pos)
-        {
-            break;
-        }
-        last = pair.second;
-    }
-
-    return string(last);
-}
-
 } // namespace epubgrep::search
--- a/src/search.hpp
+++ b/src/search.hpp
@ -66,20 +66,11 @@ struct file_in_epub
                                        std::string_view regex,
                                        const settings &opts);

-//! Strip HTML, remove newlines, condense spaces.
-[[nodiscard]] file_in_epub cleanup_text(std::string_view text);
-
 //! Return words before and after the match.
 [[nodiscard]] match_context
 context(const boost::match_results<std::string::const_iterator> &match,
        std::uint64_t words);

-//! Return last headline if possible.
-[[nodiscard]] std::string headline(const file_in_epub &file, size_t pos);
-
-//! Return current page if possible.
-[[nodiscard]] std::string page(const file_in_epub &file, size_t pos);
-
 } // namespace epubgrep::search

 #endif // EPUBGREP_SEARCH_HPP
--- a/tests/test_search_helpers.cpp
+++ b/tests/test_search_helpers.cpp
@ -1,3 +1,4 @@
+#include "book.hpp"
 #include "fs-compat.hpp"
 #include "search.hpp"

@ -26,7 +27,7 @@ SCENARIO("Searching helpers work as intended")
                text = "Moss";
                try
                {
-                    text = epubgrep::search::cleanup_text(text).text;
+                    text = epubgrep::book::process_page(text).text_cleaned;
                }
                catch (const std::exception &)
                {
@ -46,7 +47,7 @@ SCENARIO("Searching helpers work as intended")
                text = "💖\r\r🦝";
                try
                {
-                    text = epubgrep::search::cleanup_text(text).text;
+                    text = epubgrep::book::process_page(text).text_cleaned;
                }
                catch (const std::exception &)
                {
@ -66,7 +67,7 @@ SCENARIO("Searching helpers work as intended")
                text = "Moss\n\n\n\n\n\nis good.";
                try
                {
-                    text = epubgrep::search::cleanup_text(text).text;
+                    text = epubgrep::book::process_page(text).text_cleaned;
                }
                catch (const std::exception &)
                {
@ -91,8 +92,8 @@ SCENARIO("Searching helpers work as intended")
                text = "… <h3>Soup</h3> …";
                try
                {
-                    auto file{epubgrep::search::cleanup_text(text)};
-                    text = epubgrep::search::headline(file, text.size());
+                    auto file{epubgrep::book::process_page(text)};
+                    text = epubgrep::book::headline(file, text.size());
                }
                catch (const std::exception &)
                {
@ -113,8 +114,8 @@ SCENARIO("Searching helpers work as intended")
                       "road to nowhere</h2> …";
                try
                {
-                    auto file{epubgrep::search::cleanup_text(text)};
-                    text = epubgrep::search::headline(file, text.size());
+                    auto file{epubgrep::book::process_page(text)};
+                    text = epubgrep::book::headline(file, text.size());
                }
                catch (const std::exception &)
                {
@ -134,8 +135,8 @@ SCENARIO("Searching helpers work as intended")
                text = "<html><hr>The long<section>road to nowhere</section>";
                try
                {
-                    auto file{epubgrep::search::cleanup_text(text)};
-                    text = epubgrep::search::headline(file, text.size());
+                    auto file{epubgrep::book::process_page(text)};
+                    text = epubgrep::book::headline(file, text.size());
                }
                catch (const std::exception &)
                {
@ -160,8 +161,8 @@ SCENARIO("Searching helpers work as intended")
                text = R"(… <span epub:type="pagebreak" … title="69"/> …)";
                try
                {
-                    auto file{epubgrep::search::cleanup_text(text)};
-                    text = epubgrep::search::page(file, text.size());
+                    auto file{epubgrep::book::process_page(text)};
+                    text = epubgrep::book::page(file, text.size());
                }
                catch (const std::exception &)
                {
@ -181,8 +182,8 @@ SCENARIO("Searching helpers work as intended")
                text = R"(… <span role="doc-pagebreak" … aria-label="69"/> …)";
                try
                {
-                    auto file{epubgrep::search::cleanup_text(text)};
-                    text = epubgrep::search::page(file, text.size());
+                    auto file{epubgrep::book::process_page(text)};
+                    text = epubgrep::book::page(file, text.size());
                }
                catch (const std::exception &)
                {