2019-05-19 09:42:52 +02:00
|
|
|
/* This file is part of remwharead.
|
|
|
|
* Copyright © 2019 tastytea <tastytea@tastytea.de>
|
|
|
|
*
|
|
|
|
* This program is free software: you can redistribute it and/or modify
|
|
|
|
* it under the terms of the GNU General Public License as published by
|
|
|
|
* the Free Software Foundation, version 3.
|
|
|
|
*
|
|
|
|
* This program is distributed in the hope that it will be useful,
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
* GNU General Public License for more details.
|
|
|
|
*
|
|
|
|
* You should have received a copy of the GNU General Public License
|
|
|
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <algorithm>
#include <exception>
#include <iterator>
#include <list>
#include <locale>
#include <regex>
#include <thread>
#include <utility>
#include <Poco/UTF8String.h>
#include "search.hpp"
|
|
|
|
|
2019-07-27 09:59:43 +02:00
|
|
|
namespace remwharead
|
2019-05-19 09:42:52 +02:00
|
|
|
{
|
2019-08-08 15:19:42 +02:00
|
|
|
using std::list;
|
2019-07-27 09:59:43 +02:00
|
|
|
using std::regex;
|
|
|
|
using std::regex_search;
|
|
|
|
using std::smatch;
|
|
|
|
using std::find;
|
|
|
|
using std::find_if;
|
2019-08-09 00:14:26 +02:00
|
|
|
using std::thread;
|
|
|
|
using std::move;
|
2019-05-19 09:42:52 +02:00
|
|
|
|
2019-08-08 15:19:42 +02:00
|
|
|
// Creates a searcher over `entries`; the list is stored in `_entries` and is
// what every search_*() member iterates over.
Search::Search(const list<Database::entry> &entries)
    : _entries(entries)
{
}
|
|
|
|
|
|
|
|
// Breaks a search expression into a list of OR-slices, each of which is a list
// of lowercased terms that are meant to be AND-ed together.
//
// Separators are " OR " / " || " and " AND " / " && " (surrounded by single
// spaces). Example: "a AND b OR C" yields {{"a", "b"}, {"c"}}. An expression
// without separators yields one slice with one term.
const vector<vector<string>> Search::parse_expression(string expression)
    const
{
    // Cuts `text` at every match of `separator` and returns the pieces in
    // order. The text left over after the last separator is always appended,
    // so the result is never empty.
    const auto split = [](string text, const regex &separator)
        -> vector<string>
    {
        vector<string> pieces;
        smatch found;
        while (regex_search(text, found, separator))
        {
            pieces.push_back(found[1].str());
            text = found.suffix().str();
        }
        pieces.push_back(text);
        return pieces;
    };

    const regex re_or("(.+?) (OR|\\|\\|) ");
    const regex re_and("(.+?) (AND|&&) ");

    vector<vector<string>> searchlist;
    for (const string &slice : split(move(expression), re_or))
    {
        // Split each OR-slice at AND and normalise the case of every term.
        vector<string> terms = split(slice, re_and);
        for (string &term : terms)
        {
            term = to_lowercase(term);
        }
        searchlist.push_back(terms);
    }

    return searchlist;
}
|
2019-05-22 13:30:42 +02:00
|
|
|
|
2019-08-06 11:20:30 +02:00
|
|
|
// Returns a lowercased copy of `str`. The conversion is delegated to
// Poco::UTF8::toLower(); `str` itself is left untouched.
const string Search::to_lowercase(const string &str) const
{
    string lowered = Poco::UTF8::toLower(str);
    return lowered;
}
|
2019-05-19 10:55:45 +02:00
|
|
|
|
2019-08-08 15:19:42 +02:00
|
|
|
// Returns every entry whose tags satisfy `expression`.
//
// `expression` is split by parse_expression() into OR-slices of AND-ed terms.
// An entry is a hit if, for at least one OR-slice, every term of that slice
// equals one of the entry's tags (compared in lowercase). Each entry is
// returned at most once, even if it satisfies several OR-slices.
//
// If `is_re` is true, each term is a regular expression that has to match a
// whole tag. An invalid pattern makes this function throw std::regex_error,
// regardless of whether any entry would have been tested against it.
//
// NOTE: parse_expression() lowercases every term, so in regex mode upper-case
// escapes such as \S reach the regex engine as \s.
const list<DB::entry> Search::search_tags(string expression,
                                          const bool is_re) const
{
    const vector<vector<string>> searchlist = parse_expression(expression);
    list<DB::entry> result;

    for (const vector<string> &tags_or : searchlist)
    {
        // Compile the patterns once per OR-slice instead of once per
        // comparison. patterns[i] belongs to tags_or[i].
        vector<regex> patterns;
        if (is_re)
        {
            patterns.reserve(tags_or.size());
            for (const string &tag : tags_or)
            {
                patterns.emplace_back(tag);
            }
        }

        for (const DB::entry &entry : _entries)
        {
            if (find(result.begin(), result.end(), entry) != result.end())
            {   // Already added by an earlier OR-slice.
                continue;
            }

            // Lowercase the tags of the entry once, not once per search tag.
            vector<string> entry_tags;
            for (const string &entry_tag : entry.tags)
            {
                entry_tags.push_back(to_lowercase(entry_tag));
            }

            // Add entry to result if all tags in an OR-slice match. Stop at
            // the first tag that does not.
            bool matched = true;
            for (size_t i = 0; matched && i < tags_or.size(); ++i)
            {
                if (is_re)
                {
                    // regex_match() anchors the whole pattern. Wrapping it in
                    // "^" and "$" would only anchor the outermost
                    // alternatives of a pattern like "a|b".
                    const regex &re = patterns[i];
                    matched = std::any_of(
                        entry_tags.begin(), entry_tags.end(),
                        [&re](const string &s)
                        {
                            return std::regex_match(s, re);
                        });
                }
                else
                {
                    matched = (find(entry_tags.begin(), entry_tags.end(),
                                    tags_or[i]) != entry_tags.end());
                }
            }

            if (matched)
            {
                result.push_back(entry);
            }
        }
    }

    return result;
}
|
2019-05-19 10:55:45 +02:00
|
|
|
|
2019-08-08 15:19:42 +02:00
|
|
|
// Returns every entry that matches `expression` in its tags, title,
// description or full text.
//
// The result starts with the hits of search_tags(). After that, for each
// OR-slice produced by parse_expression(), an entry that is not yet in the
// result is added if all terms of the slice are found in its title, or all in
// its description, or all in its full text. The three fields are compared in
// lowercase.
//
// If `is_re` is true, each term is a regular expression that may match
// anywhere inside a field; otherwise it is a plain substring. An invalid
// pattern makes this function throw std::regex_error.
//
// NOTE: parse_expression() lowercases every term, so in regex mode upper-case
// escapes such as \S reach the regex engine as \s.
const list<DB::entry> Search::search_all(string expression,
                                         const bool is_re) const
{
    const vector<vector<string>> searchlist = parse_expression(expression);
    list<DB::entry> result = search_tags(expression, is_re);

    for (const vector<string> &terms_or : searchlist)
    {
        // Compile the patterns once per OR-slice instead of once per entry.
        // patterns[i] belongs to terms_or[i].
        vector<regex> patterns;
        if (is_re)
        {
            patterns.reserve(terms_or.size());
            for (const string &term : terms_or)
            {
                patterns.emplace_back(term);
            }
        }

        for (const DB::entry &entry : _entries)
        {
            if (find(result.begin(), result.end(), entry) != result.end())
            {   // Skip if already in result list.
                continue;
            }

            // The lowercased fields do not depend on the term, so they are
            // computed once per entry.
            const string title = to_lowercase(entry.title);
            const string description = to_lowercase(entry.description);
            const string fulltext = to_lowercase(entry.fulltext);

            // Each flag stays true only while every term so far was found in
            // the respective field. A field that has already failed is not
            // tested again.
            bool matched_title = true;
            bool matched_description = true;
            bool matched_fulltext = true;

            for (size_t i = 0; i < terms_or.size(); ++i)
            {
                if (is_re)
                {
                    const regex &re = patterns[i];
                    matched_title = matched_title
                        && regex_search(title, re);
                    matched_description = matched_description
                        && regex_search(description, re);
                    matched_fulltext = matched_fulltext
                        && regex_search(fulltext, re);
                }
                else
                {
                    const string &term = terms_or[i];
                    matched_title = matched_title
                        && (title.find(term) != string::npos);
                    matched_description = matched_description
                        && (description.find(term) != string::npos);
                    matched_fulltext = matched_fulltext
                        && (fulltext.find(term) != string::npos);
                }

                if (!matched_title && !matched_description
                    && !matched_fulltext)
                {   // No field can satisfy this OR-slice any more.
                    break;
                }
            }

            if (matched_title || matched_description || matched_fulltext)
            {
                result.push_back(entry);
            }
        }
    }

    return result;
}
|
2019-08-09 00:14:26 +02:00
|
|
|
|
|
|
|
// Same search as search_all(), but the entries are cut into segments that are
// searched by one thread each.
//
// With at most `min_len` entries everything is searched as a single segment
// (still in a separate thread). Above that, the list is cut into pieces of
// `len / n_threads` entries, but never fewer than `min_per_thread` entries
// per piece. The hits are concatenated in segment order.
//
// If the search in any segment throws (for example std::regex_error for an
// invalid pattern when `is_re` is true), all threads are joined first and the
// exception of the earliest failing segment is rethrown to the caller.
const list<Database::entry> Search::search_all_threaded(
    string expression, const bool is_re) const
{
    list<Database::entry> entries = _entries;

    const size_t len = entries.size();
    constexpr size_t min_len = 100;
    constexpr size_t min_per_thread = 50;
    // hardware_concurrency() may return 0; the "+ 1" keeps this at least 1.
    const size_t n_threads = thread::hardware_concurrency() / 3 + 1;
    size_t cut_at = len;
    if (len > min_len)
    {   // If there are over `min_len` entries, use `n_threads` threads.
        cut_at = len / n_threads;

        // But don't use less than `min_per_thread` entries per thread.
        if (cut_at < min_per_thread)
        {
            cut_at = min_per_thread;
        }
    }

    list<list<Database::entry>> segments;

    // Use threads if list is big.
    while (entries.size() > cut_at)
    {
        list<Database::entry> segment;

        auto it = entries.begin();
        std::advance(it, cut_at);

        // Move the first `cut_at` entries into `segments`.
        segment.splice(segment.begin(), entries, entries.begin(), it);
        segments.push_back(move(segment));
    }
    // Move rest of `entries` into `segments`.
    segments.push_back(move(entries));
    // `entries` is reused for the result below. A moved-from list is only
    // guaranteed to be valid, not to be empty.
    entries.clear();

    // One slot per segment for an exception thrown inside its thread. An
    // exception that leaves a thread function would terminate the program.
    list<std::exception_ptr> errors(segments.size());
    list<thread> threads;

    try
    {
        auto error_it = errors.begin();
        for (auto segment_it = segments.begin();
             segment_it != segments.end(); ++segment_it, ++error_it)
        {
            // List elements never move, so these pointers stay valid until
            // the threads have been joined.
            list<Database::entry> *segment = &*segment_it;
            std::exception_ptr *error = &*error_it;

            threads.emplace_back(
                [segment, error, &expression, is_re]
                {
                    try
                    {
                        const Search search(*segment);
                        // Replace `segment` with `result`.
                        *segment = search.search_all(expression, is_re);
                    }
                    catch (...)
                    {
                        *error = std::current_exception();
                    }
                });
        }
    }
    catch (...)
    {
        // Starting a thread failed. Destroying a joinable thread would
        // terminate the program, so wait for the ones already running.
        for (thread &t : threads)
        {
            t.join();
        }
        throw;
    }

    for (thread &t : threads)
    {
        t.join();
    }

    for (const std::exception_ptr &error : errors)
    {
        if (error)
        {
            std::rethrow_exception(error);
        }
    }

    // Move each of `segments` into `entries`.
    for (list<Database::entry> &segment : segments)
    {
        entries.splice(entries.end(), segment);
    }

    return entries;
}
|
2019-05-19 10:55:45 +02:00
|
|
|
}
|