remwharead/src/lib/search.cpp

/*  This file is part of remwharead.
 *  Copyright © 2019 tastytea <tastytea@tastytea.de>
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, version 3.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#include "search.hpp"
#include <Poco/RegularExpression.h>
#include <Poco/UTF8String.h>
#include <algorithm>
#include <iterator>
#include <list>
#include <locale>
#include <thread>
#include <utility>

namespace remwharead
{
using std::list;
using std::find;
using std::find_if;
using std::thread;
using std::move;
using RegEx = Poco::RegularExpression;

Search::Search(list<Database::entry> entries)
    :_entries(move(entries))
{}

vector<vector<string>> Search::parse_expression(const string &expression)
{
    vector<vector<string>> searchlist;
    const RegEx re_or("(.+?) (OR|\\|\\|) ");
    const RegEx re_and("(.+?) (AND|&&) ");
    RegEx::MatchVec matches;
    string::size_type pos = 0;

    vector<string> subexpressions;
    {                           // Split expression at OR.
        while (re_or.match(expression, pos, matches) != 0)
        {
            const string &subexpr = expression.substr(matches[1].offset,
                                                      matches[1].length);
            subexpressions.push_back(subexpr);
            pos = matches[0].offset + matches[0].length;
        }
        subexpressions.push_back(expression.substr(pos));
    }

    {
        for (const string &sub : subexpressions)
        {                       // Split each OR-slice at AND.
            vector<string> terms;
            pos = 0;

            while (re_and.match(sub, pos, matches) != 0)
            {
                const string &term = sub.substr(matches[1].offset,
                                                matches[1].length);
                terms.push_back(to_lowercase(term));
                pos = matches[0].offset + matches[0].length;
            }
            terms.push_back(to_lowercase(sub.substr(pos)));
            searchlist.push_back(terms);
        }
    }

    return searchlist;
}

string Search::to_lowercase(const string &str)
{
    return Poco::UTF8::toLower(str);
}

list<Database::entry> Search::search_tags(const string &expression,
                                          const bool is_re) const
{
    vector<vector<string>> searchlist = parse_expression(expression);
    list<Database::entry> result;

    for (const vector<string> &tags_or : searchlist)
    {
        for (const Database::entry &entry : _entries)
        {               // Add entry to result if all tags in an OR-slice match.
            bool matched = true;

            for (const string &tag : tags_or)
            {
                const auto it = find_if(
                    entry.tags.begin(), entry.tags.end(),
                    [&, is_re](string s)
                    {
                        s = to_lowercase(s);
                        if (is_re)
                        {
                            const RegEx re("^" + tag + "$");
                            return (re == s);
                        }

                        return (s == tag);
                    });
                if (it == entry.tags.end())
                {
                    matched = false;
                }
            }
            if (matched)
            {
                result.push_back(entry);
            }
        }
    }

    return result;
}

list<Database::entry> Search::search_all(const string &expression,
                                         const bool is_re) const
{
    vector<vector<string>> searchlist = parse_expression(expression);
    list<Database::entry> result = search_tags(expression, is_re);

    for (const vector<string> &terms_or : searchlist)
    {
        for (const Database::entry &entry : _entries)
        {
            // Add entry to result if all terms in an OR-slice match title,
            // description or full text.
            bool matched_title = true;
            bool matched_description = true;
            bool matched_fulltext = true;

            const auto it = find(result.begin(), result.end(), entry);
            if (it != result.end())
            {                   // Skip if already in result list.
                continue;
            }

            for (const string &term : terms_or)
            {
                const string title = to_lowercase(entry.title);
                const string description = to_lowercase(entry.description);
                const string fulltext = to_lowercase(entry.fulltext);

                // Set matched_* to false if term is not found.
                if (is_re)
                {
                    const RegEx re(term);

                    if (!(re == title))
                    {
                        matched_title = false;
                    }

                    if (!(re == description))
                    {
                        matched_description = false;
                    }

                    if (!(re == fulltext))
                    {
                        matched_fulltext = false;
                    }
                }
                else
                {
                    if (title.find(term) == string::npos)
                    {
                        matched_title = false;
                    }

                    if (description.find(term) == string::npos)
                    {
                        matched_description = false;
                    }

                    if (fulltext.find(term) == string::npos)
                    {
                        matched_fulltext = false;
                    }
                }
            }
            if (matched_title || matched_description || matched_fulltext)
            {
                result.push_back(entry);
            }
        }
    }

    return result;
}

list<Database::entry> Search::search_all_threaded(const string &expression,
                                                  const bool is_re) const
{
    list<Database::entry> entries = _entries;

    const size_t len = entries.size();
    constexpr size_t min_len = 100;
    constexpr size_t min_per_thread = 50;
    const size_t n_threads = thread::hardware_concurrency() / 3 + 1;
    size_t cut_at = len;
    if (len > min_len)
    {   // If there are over `min_len` entries, use `n_threads` threads.
        cut_at = len / n_threads;

        // But don't use less than `min_per_thread` entries per thread.
        if (cut_at < min_per_thread)
        {
            cut_at = min_per_thread;
        }
    }

    list<list<Database::entry>> segments;

    // Use threads if list is big.
    while (entries.size() > cut_at)
    {
        list<Database::entry> segment;

        auto it = entries.begin();
        std::advance(it, cut_at);

        // Move the first `cut_at` entries into `segments`.
        segment.splice(segment.begin(), entries, entries.begin(), it);
        segments.push_back(move(segment));
    }
    // Move rest of `entries` into `segments`.
    list<Database::entry> rest;
    rest.splice(rest.begin(), entries);
    segments.push_back(move(rest));

    list<thread> threads;
    for (auto &segment : segments)
    {
        thread t(
            [&]
            {
                Search search(segment);
                // Replace `segment` with `result`.
                segment = search.search_all(expression, is_re);
            });
        threads.push_back(move(t));
    }

    for (thread &t : threads)
    {
        t.join();
        // Move each of `segments` into `entries`.
        entries.splice(entries.end(), segments.front());
        segments.pop_front();
    }

    return entries;

}
} // namespace remwharead