remwharead/src/lib/search.cpp

/*  This file is part of remwharead.
 *  Copyright © 2019 tastytea <tastytea@tastytea.de>
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, version 3.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#include <algorithm>
#include <exception>
#include <iterator>
#include <list>
#include <locale>
#include <regex>
#include <thread>
#include <utility>
#include <Poco/UTF8String.h>
#include "search.hpp"

namespace remwharead
{
    using std::list;
    using std::regex;
    using std::regex_search;
    using std::smatch;
    using std::find;
    using std::find_if;
    using std::thread;
    using std::move;

    // Creates a searcher over `entries`. The list is handed to the `_entries`
    // member, which every search function in this file iterates over.
    // NOTE(review): whether `_entries` keeps a copy or a reference depends on
    // its declaration in search.hpp, which is not visible here — confirm
    // before relying on the lifetime of `entries`.
    Search::Search(const list<Database::entry> &entries)
        :_entries(entries)
    {}

    // Turns a search expression into a list of OR-alternatives, each of which
    // is a list of lowercased terms that must all match (AND).
    //
    // "a AND b OR c" becomes { {"a", "b"}, {"c"} }. The operators are
    // `OR` / `||` and `AND` / `&&`; each has to be preceded and followed by a
    // single space to be recognised. The result always contains at least one
    // alternative, and every alternative contains at least one term (possibly
    // an empty string).
    const vector<vector<string>> Search::parse_expression(string expression)
        const
    {
        const regex or_delimiter("(.+?) (OR|\\|\\|) ");
        const regex and_delimiter("(.+?) (AND|&&) ");

        // Cuts `text` at every occurrence of `delimiter`. The pieces come back
        // in order; whatever follows the last delimiter is the final piece.
        const auto split = [](string text, const regex &delimiter)
        {
            vector<string> pieces;
            smatch found;
            while (regex_search(text, found, delimiter))
            {
                pieces.push_back(found[1].str());
                text = found.suffix().str();
            }
            pieces.push_back(text);
            return pieces;
        };

        vector<vector<string>> parsed;
        for (const string &alternative : split(expression, or_delimiter))
        {
            vector<string> lowered_terms;
            for (const string &term : split(alternative, and_delimiter))
            {
                lowered_terms.push_back(to_lowercase(term));
            }
            parsed.push_back(lowered_terms);
        }

        return parsed;
    }

    // Returns a lowercased copy of `str`; the conversion is delegated to
    // Poco's UTF-8 helper, `str` itself is left untouched.
    const string Search::to_lowercase(const string &str) const
    {
        string lowered = Poco::UTF8::toLower(str);
        return lowered;
    }

    // Returns the entries whose tags satisfy `expression`.
    //
    // The expression is split by parse_expression() into OR-alternatives made
    // of AND-terms. An entry is a hit if, for at least one alternative, every
    // term of that alternative equals one of the entry's lowercased tags. With
    // `is_re` set, each term is instead used as a regular expression that has
    // to match a whole tag (it is wrapped in `^` … `$`).
    //
    // Hits are ordered by alternative first and by position in `_entries`
    // second. Each entry is returned at most once, even if several
    // alternatives match it.
    //
    // In regex mode an invalid term makes the std::regex constructor throw
    // (std::regex_error); as before, this only happens once an entry that
    // actually has tags is encountered.
    const list<DB::entry> Search::search_tags(string expression,
                                              const bool is_re) const
    {
        const vector<vector<string>> searchlist = parse_expression(expression);
        list<DB::entry> result;

        // One flag per element of `_entries` (by position): set once the
        // element has been put into `result`, so that a later OR-alternative
        // does not add it a second time.
        vector<bool> added(_entries.size(), false);

        for (const vector<string> &tags_or : searchlist)
        {
            // Regex mode only: one compiled pattern per term of this
            // alternative, in the same order as `tags_or`. Filled when the
            // first entry with tags shows up instead of once per tag.
            vector<regex> patterns;

            size_t position = 0;
            for (const DB::entry &entry : _entries)
            {
                const size_t current = position;
                ++position;

                if (is_re && patterns.empty()
                    && entry.tags.begin() != entry.tags.end())
                {
                    for (const string &term : tags_or)
                    {
                        patterns.emplace_back("^" + term + "$");
                    }
                }

                if (added[current])
                {                   // Already matched by an earlier alternative.
                    continue;
                }

                // Lowercase the tags once per entry, not once per term.
                vector<string> lowered_tags;
                for (const string &tag : entry.tags)
                {
                    lowered_tags.push_back(to_lowercase(tag));
                }

                // All terms have to match; stop at the first one that doesn't.
                bool matched = true;
                for (size_t i = 0; matched && i < tags_or.size(); ++i)
                {
                    const auto hit = find_if(
                        lowered_tags.begin(), lowered_tags.end(),
                        [&](const string &tag)
                        {
                            // Only reached if the entry has tags, in which
                            // case `patterns` has been filled above.
                            return is_re ? regex_search(tag, patterns[i])
                                         : (tag == tags_or[i]);
                        });
                    matched = (hit != lowered_tags.end());
                }

                if (matched)
                {
                    added[current] = true;
                    result.push_back(entry);
                }
            }
        }

        return result;
    }

    // Returns the entries matching `expression` in their tags, title,
    // description or full text.
    //
    // The result starts with everything search_tags() finds. After that, for
    // each OR-alternative of the expression, every entry not yet in the result
    // is added if all terms of the alternative occur in its title, or all of
    // them occur in its description, or all of them occur in its full text
    // (terms are not mixed across fields). Texts are lowercased before the
    // comparison. With `is_re` set, terms are regular expressions searched for
    // anywhere in the text; otherwise they are plain substrings.
    //
    // In regex mode an invalid term makes the std::regex constructor throw
    // (std::regex_error).
    const list<DB::entry> Search::search_all(string expression,
                                             const bool is_re) const
    {
        const vector<vector<string>> searchlist = parse_expression(expression);
        list<DB::entry> result = search_tags(expression, is_re);

        for (const vector<string> &terms_or : searchlist)
        {
            // Regex mode only: one compiled pattern per term of this
            // alternative, in the same order as `terms_or`. Filled when the
            // first entry that needs checking shows up, then reused for all
            // following entries.
            vector<regex> patterns;

            for (const DB::entry &entry : _entries)
            {
                if (find(result.begin(), result.end(), entry) != result.end())
                {                   // Skip if already in result list.
                    continue;
                }

                // The lowercased texts are the same for every term, so build
                // them once per entry.
                const string title = to_lowercase(entry.title);
                const string description = to_lowercase(entry.description);
                const string fulltext = to_lowercase(entry.fulltext);

                if (is_re && patterns.empty())
                {
                    for (const string &term : terms_or)
                    {
                        patterns.emplace_back(term);
                    }
                }

                // A field stays `true` only while every term so far was found
                // in it; a field that failed once is not searched again.
                bool matched_title = true;
                bool matched_description = true;
                bool matched_fulltext = true;

                for (size_t i = 0; i < terms_or.size(); ++i)
                {
                    if (is_re)
                    {
                        const regex &re = patterns[i];
                        matched_title = matched_title
                            && regex_search(title, re);
                        matched_description = matched_description
                            && regex_search(description, re);
                        matched_fulltext = matched_fulltext
                            && regex_search(fulltext, re);
                    }
                    else
                    {
                        const string &term = terms_or[i];
                        matched_title = matched_title
                            && (title.find(term) != string::npos);
                        matched_description = matched_description
                            && (description.find(term) != string::npos);
                        matched_fulltext = matched_fulltext
                            && (fulltext.find(term) != string::npos);
                    }

                    if (!matched_title && !matched_description
                        && !matched_fulltext)
                    {               // No field can match anymore.
                        break;
                    }
                }

                if (matched_title || matched_description || matched_fulltext)
                {
                    result.push_back(entry);
                }
            }
        }

        return result;
    }

    // Same search as search_all(), but the entries are cut into segments that
    // are searched in parallel, one thread per segment.
    //
    // Lists of up to `min_len` entries are searched directly in the calling
    // thread. Larger lists are cut into segments of `len / n_threads` entries,
    // but never fewer than `min_per_thread` per segment. The per-segment
    // results are concatenated in segment order.
    //
    // An exception thrown while searching a segment (for example
    // std::regex_error for an invalid pattern in regex mode) is caught inside
    // the worker and rethrown here after all workers have finished, so callers
    // can handle it exactly as they would for search_all(). If several
    // segments fail, the exception of the earliest segment is rethrown.
    const list<Database::entry> Search::search_all_threaded(
        string expression, const bool is_re) const
    {
        const size_t len = _entries.size();
        constexpr size_t min_len = 100;
        constexpr size_t min_per_thread = 50;
        const size_t n_threads = thread::hardware_concurrency() / 3 + 1;
        size_t cut_at = len;
        if (len > min_len)
        {   // If there are over `min_len` entries, use `n_threads` threads.
            cut_at = len / n_threads;

            // But don't use less than `min_per_thread` entries per thread.
            if (cut_at < min_per_thread)
            {
                cut_at = min_per_thread;
            }
        }

        if (len <= cut_at)
        {   // Everything fits into one segment: no copy and no thread needed.
            return search_all(expression, is_re);
        }

        list<Database::entry> entries = _entries;
        list<list<Database::entry>> segments;

        while (entries.size() > cut_at)
        {
            auto it = entries.begin();
            std::advance(it, cut_at);

            // Move the first `cut_at` entries into their own segment.
            list<Database::entry> segment;
            segment.splice(segment.begin(), entries, entries.begin(), it);
            segments.push_back(move(segment));
        }
        // Move rest of `entries` into `segments`.
        segments.push_back(move(entries));

        // One slot per segment for an exception caught in its worker. The
        // vector is never resized, so references into it stay valid.
        vector<std::exception_ptr> errors(segments.size());
        list<thread> threads;

        const auto join_all = [&threads]
        {
            for (thread &t : threads)
            {
                if (t.joinable())
                {
                    t.join();
                }
            }
        };

        try
        {
            size_t index = 0;
            for (list<Database::entry> &segment : segments)
            {
                std::exception_ptr &error = errors[index];
                ++index;

                threads.emplace_back(
                    [&segment, &error, &expression, is_re]
                    {
                        try
                        {
                            Search search(segment);
                            // Replace `segment` with its search result.
                            segment = search.search_all(expression, is_re);
                        }
                        catch (...)
                        {   // Must not leave the thread; hand it to the caller.
                            error = std::current_exception();
                        }
                    });
            }
        }
        catch (...)
        {   // Starting a thread failed. Destroying a joinable thread would
            // terminate the program, so wait for the ones already running.
            join_all();
            throw;
        }

        join_all();

        for (const std::exception_ptr &error : errors)
        {
            if (error)
            {
                std::rethrow_exception(error);
            }
        }

        // Concatenate the per-segment results in segment order.
        list<Database::entry> result;
        for (list<Database::entry> &segment : segments)
        {
            result.splice(result.end(), segment);
        }

        return result;
    }
}
Added option to search for tags. 2019-05-19 09:42:52 +02:00			`/* This file is part of remwharead.`
			`* Copyright © 2019 tastytea <tastytea@tastytea.de>`
			`*`
			`* This program is free software: you can redistribute it and/or modify`
			`* it under the terms of the GNU General Public License as published by`
			`* the Free Software Foundation, version 3.`
			`*`
			`* This program is distributed in the hope that it will be useful,`
			`* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`* GNU General Public License for more details.`
			`*`
			`* You should have received a copy of the GNU General Public License`
			`* along with this program. If not, see <http://www.gnu.org/licenses/>.`
			`*/`

			`#include <regex>`
			`#include <algorithm>`
Made searches case insensitive. 2019-05-22 13:30:42 +02:00			`#include <locale>`
Changed vector<Database::entry> to list<Database::entry>. 2019-08-08 15:19:42 +02:00			`#include <list>`
Moved threading into library. 2019-08-09 00:14:26 +02:00			`#include <thread>`
			`#include <utility>`
			`#include <iterator>`
Replaced ICU with POCO. 2019-08-06 11:39:51 +02:00			`#include <Poco/UTF8String.h>`
Added option to search for tags. 2019-05-19 09:42:52 +02:00			`#include "search.hpp"`

namespaced library. 2019-07-27 09:59:43 +02:00			`namespace remwharead`
Added option to search for tags. 2019-05-19 09:42:52 +02:00			`{`
Changed vector<Database::entry> to list<Database::entry>. 2019-08-08 15:19:42 +02:00			`using std::list;`
namespaced library. 2019-07-27 09:59:43 +02:00			`using std::regex;`
			`using std::regex_search;`
			`using std::smatch;`
			`using std::find;`
			`using std::find_if;`
Moved threading into library. 2019-08-09 00:14:26 +02:00			`using std::thread;`
			`using std::move;`
Added option to search for tags. 2019-05-19 09:42:52 +02:00
Changed vector<Database::entry> to list<Database::entry>. 2019-08-08 15:19:42 +02:00			`Search::Search(const list<Database::entry> &entries)`
Transformed the search functions into a class. Deleted unicode tests because they are covered by search tests. 2019-08-06 11:20:30 +02:00			`:_entries(entries)`
			`{}`

			`const vector<vector<string>> Search::parse_expression(string expression)`
			`const`
Added option to search for tags. 2019-05-19 09:42:52 +02:00			`{`
namespaced library. 2019-07-27 09:59:43 +02:00			`vector<vector<string>> searchlist;`
			`const regex re_or("(.+?) (OR\|\\\|\\\|) ");`
			`const regex re_and("(.+?) (AND\|&&) ");`
			`smatch match;`

			`vector<string> subexpressions;`
			`{ // Split expression at OR.`
			`while (regex_search(expression, match, re_or))`
Added option to search for tags. 2019-05-19 09:42:52 +02:00			`{`
namespaced library. 2019-07-27 09:59:43 +02:00			`subexpressions.push_back(match[1].str());`
			`expression = match.suffix().str();`
Added option to search for tags. 2019-05-19 09:42:52 +02:00			`}`
namespaced library. 2019-07-27 09:59:43 +02:00			`subexpressions.push_back(expression);`
Added option to search for tags. 2019-05-19 09:42:52 +02:00			`}`

namespaced library. 2019-07-27 09:59:43 +02:00			`{`
			`for (string sub : subexpressions)`
			`{ // Split each OR-slice at AND.`
			`vector<string> terms;`
			`while (regex_search(sub, match, re_and))`
			`{`
			`terms.push_back(to_lowercase(match[1].str()));`
			`sub = match.suffix().str();`
			`}`
			`terms.push_back(to_lowercase(sub));`
			`searchlist.push_back(terms);`
			`}`
			`}`
Added option to search in tags, titles, descriptions and full texts. 2019-05-19 10:55:45 +02:00
namespaced library. 2019-07-27 09:59:43 +02:00			`return searchlist;`
			`}`
Made searches case insensitive. 2019-05-22 13:30:42 +02:00
Transformed the search functions into a class. Deleted unicode tests because they are covered by search tests. 2019-08-06 11:20:30 +02:00			`const string Search::to_lowercase(const string &str) const`
namespaced library. 2019-07-27 09:59:43 +02:00			`{`
Replaced ICU with POCO. 2019-08-06 11:39:51 +02:00			`return Poco::UTF8::toLower(str);`
namespaced library. 2019-07-27 09:59:43 +02:00			`}`
Added option to search in tags, titles, descriptions and full texts. 2019-05-19 10:55:45 +02:00
Changed vector<Database::entry> to list<Database::entry>. 2019-08-08 15:19:42 +02:00			`const list<DB::entry> Search::search_tags(string expression,`
			`const bool is_re) const`
Added option to search for tags. 2019-05-19 09:42:52 +02:00			`{`
namespaced library. 2019-07-27 09:59:43 +02:00			`vector<vector<string>> searchlist = parse_expression(expression);`
Changed vector<Database::entry> to list<Database::entry>. 2019-08-08 15:19:42 +02:00			`list<DB::entry> result;`
Added support for regular expressions. 2019-07-25 03:38:26 +02:00
namespaced library. 2019-07-27 09:59:43 +02:00			`for (const vector<string> &tags_or : searchlist)`
			`{`
Transformed the search functions into a class. Deleted unicode tests because they are covered by search tests. 2019-08-06 11:20:30 +02:00			`for (const DB::entry &entry : _entries)`
namespaced library. 2019-07-27 09:59:43 +02:00			`{ // Add entry to result if all tags in an OR-slice match.`
			`bool matched = true;`

			`for (const string &tag : tags_or)`
Added option to search for tags. 2019-05-19 09:42:52 +02:00			`{`
namespaced library. 2019-07-27 09:59:43 +02:00			`const auto it = find_if(`
			`entry.tags.begin(), entry.tags.end(),`
Transformed the search functions into a class. Deleted unicode tests because they are covered by search tests. 2019-08-06 11:20:30 +02:00			`[&, is_re](string s)`
namespaced library. 2019-07-27 09:59:43 +02:00			`{`
			`s = to_lowercase(s);`
			`if (is_re)`
			`{`
			`const regex re("^" + tag + "$");`
			`return regex_search(s, re);`
			`}`
			`else`
			`{`
			`return (s == tag);`
			`}`
			`});`
			`if (it == entry.tags.end())`
			`{`
			`matched = false;`
			`}`
			`}`
			`if (matched == true)`
			`{`
			`result.push_back(entry);`
Added option to search for tags. 2019-05-19 09:42:52 +02:00			`}`
			`}`
			`}`
Added option to search in tags, titles, descriptions and full texts. 2019-05-19 10:55:45 +02:00
namespaced library. 2019-07-27 09:59:43 +02:00			`return result;`
			`}`
Added option to search in tags, titles, descriptions and full texts. 2019-05-19 10:55:45 +02:00
Changed vector<Database::entry> to list<Database::entry>. 2019-08-08 15:19:42 +02:00			`const list<DB::entry> Search::search_all(string expression,`
			`const bool is_re) const`
Added option to search in tags, titles, descriptions and full texts. 2019-05-19 10:55:45 +02:00			`{`
namespaced library. 2019-07-27 09:59:43 +02:00			`vector<vector<string>> searchlist = parse_expression(expression);`
Changed vector<Database::entry> to list<Database::entry>. 2019-08-08 15:19:42 +02:00			`list<DB::entry> result = search_tags(expression, is_re);`
Ruled out duplicates when searching. 2019-05-19 12:47:38 +02:00
namespaced library. 2019-07-27 09:59:43 +02:00			`for (const vector<string> &terms_or : searchlist)`
			`{`
Transformed the search functions into a class. Deleted unicode tests because they are covered by search tests. 2019-08-06 11:20:30 +02:00			`for (const DB::entry &entry : _entries)`
Added option to search in tags, titles, descriptions and full texts. 2019-05-19 10:55:45 +02:00			`{`
namespaced library. 2019-07-27 09:59:43 +02:00			`// Add entry to result if all terms in an OR-slice match title,`
			`// description or full text.`
			`bool matched_title = true;`
			`bool matched_description = true;`
			`bool matched_fulltext = true;`

			`const auto it = find(result.begin(), result.end(), entry);`
			`if (it != result.end())`
			`{ // Skip if already in result list.`
			`continue;`
			`}`
Added option to search in tags, titles, descriptions and full texts. 2019-05-19 10:55:45 +02:00
namespaced library. 2019-07-27 09:59:43 +02:00			`for (const string &term : terms_or)`
Added option to search in tags, titles, descriptions and full texts. 2019-05-19 10:55:45 +02:00			`{`
namespaced library. 2019-07-27 09:59:43 +02:00			`const string title = to_lowercase(entry.title);`
			`const string description = to_lowercase(entry.description);`
			`const string fulltext = to_lowercase(entry.fulltext);`
Added support for regular expressions. 2019-07-25 03:38:26 +02:00
namespaced library. 2019-07-27 09:59:43 +02:00			`// Set matched_* to false if term is not found.`
			`if (is_re)`
Added support for regular expressions. 2019-07-25 03:38:26 +02:00			`{`
namespaced library. 2019-07-27 09:59:43 +02:00			`const regex re(term);`

			`if(!regex_search(title, re))`
			`{`
			`matched_title = false;`
			`}`

			`if(!regex_search(description, re))`
			`{`
			`matched_description = false;`
			`}`

			`if(!regex_search(fulltext, re))`
			`{`
			`matched_fulltext = false;`
			`}`
Added support for regular expressions. 2019-07-25 03:38:26 +02:00			`}`
namespaced library. 2019-07-27 09:59:43 +02:00			`else`
Added support for regular expressions. 2019-07-25 03:38:26 +02:00			`{`
namespaced library. 2019-07-27 09:59:43 +02:00			`if (title.find(term) == string::npos)`
			`{`
			`matched_title = false;`
			`}`

			`if (description.find(term) == string::npos)`
			`{`
			`matched_description = false;`
			`}`

			`if (fulltext.find(term) == string::npos)`
			`{`
			`matched_fulltext = false;`
			`}`
Added support for regular expressions. 2019-07-25 03:38:26 +02:00			`}`
Added option to search in tags, titles, descriptions and full texts. 2019-05-19 10:55:45 +02:00			`}`
namespaced library. 2019-07-27 09:59:43 +02:00			`if (matched_title == true`
			`\|\| matched_description == true`
			`\|\| matched_fulltext == true)`
Added option to search in tags, titles, descriptions and full texts. 2019-05-19 10:55:45 +02:00			`{`
namespaced library. 2019-07-27 09:59:43 +02:00			`result.push_back(entry);`
Added option to search in tags, titles, descriptions and full texts. 2019-05-19 10:55:45 +02:00			`}`
			`}`
			`}`

namespaced library. 2019-07-27 09:59:43 +02:00			`return result;`
			`}`
Moved threading into library. 2019-08-09 00:14:26 +02:00
			`const list<Database::entry> Search::search_all_threaded(`
			`string expression, const bool is_re) const`
			`{`
			`list<Database::entry> entries = _entries;`

			`const size_t len = entries.size();`
			`constexpr size_t min_len = 100;`
			`constexpr size_t min_per_thread = 50;`
			`const size_t n_threads = thread::hardware_concurrency() / 3 + 1;`
			`size_t cut_at = len;`
			`if (len > min_len)`
			{ // If there are over `min_len` entries, use `n_threads` threads.
			`cut_at = len / n_threads;`

			// But don't use less than `min_per_thread` entries per thread.
			`if (cut_at < min_per_thread)`
			`{`
			`cut_at = min_per_thread;`
			`}`
			`}`

			`list<list<Database::entry>> segments;`

			`// Use threads if list is big.`
			`while (entries.size() > cut_at)`
			`{`
			`list<Database::entry> segment;`

			`auto it = entries.begin();`
			`std::advance(it, cut_at);`

			// Move the first `cut_at` entries into `segments`.
			`segment.splice(segment.begin(), entries, entries.begin(), it);`
			`segments.push_back(move(segment));`
			`}`
			// Move rest of `entries` into `segments`.
			`segments.push_back(move(entries));`

			`list<thread> threads;`
			`for (auto &segment : segments)`
			`{`
			`thread t(`
			`[&]`
			`{`
			`Search search(segment);`
			// Replace `segment` with `result`.
			`segment = search.search_all(expression, is_re);`
			`});`
			`threads.push_back(move(t));`
			`}`

			`for (thread &t : threads)`
			`{`
			`t.join();`
			// Move each of `segments` into `entries`.
			`entries.splice(entries.end(), segments.front());`
			`segments.pop_front();`
			`}`

			`return entries;`
			`}`
Added option to search in tags, titles, descriptions and full texts. 2019-05-19 10:55:45 +02:00			`}`