2019-05-11 02:52:33 +02:00
|
|
|
/* This file is part of remwharead.
|
|
|
|
* Copyright © 2019 tastytea <tastytea@tastytea.de>
|
|
|
|
*
|
|
|
|
* This program is free software: you can redistribute it and/or modify
|
|
|
|
* it under the terms of the GNU General Public License as published by
|
|
|
|
* the Free Software Foundation, version 3.
|
|
|
|
*
|
|
|
|
* This program is distributed in the hope that it will be useful,
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
* GNU General Public License for more details.
|
|
|
|
*
|
|
|
|
* You should have received a copy of the GNU General Public License
|
|
|
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
*/
|
|
|
|
|
2019-05-12 00:02:31 +02:00
|
|
|
#include <iostream>
|
|
|
|
#include <string>
|
2019-05-14 22:57:51 +02:00
|
|
|
#include <chrono>
|
2019-05-16 08:37:50 +02:00
|
|
|
#include <fstream>
|
2019-05-22 12:25:06 +02:00
|
|
|
#include <locale>
|
2019-08-08 15:36:04 +02:00
|
|
|
#include <thread>
|
|
|
|
#include <algorithm>
|
|
|
|
#include <iterator>
|
2019-08-08 15:19:42 +02:00
|
|
|
#include <list>
|
2019-08-08 20:37:38 +02:00
|
|
|
#include <cerrno>
|
2019-05-12 20:19:00 +02:00
|
|
|
#include "sqlite.hpp"
|
2019-08-06 17:14:32 +02:00
|
|
|
#include "remwharead_cli.hpp"
|
2019-05-16 08:36:35 +02:00
|
|
|
#include "uri.hpp"
|
2019-05-26 15:47:06 +02:00
|
|
|
#include "types.hpp"
|
2019-07-26 07:12:50 +02:00
|
|
|
#include "export/csv.hpp"
|
|
|
|
#include "export/adoc.hpp"
|
|
|
|
#include "export/bookmarks.hpp"
|
|
|
|
#include "export/simple.hpp"
|
2019-05-19 09:42:52 +02:00
|
|
|
#include "search.hpp"
|
2019-05-12 00:02:31 +02:00
|
|
|
|
2019-07-27 09:59:43 +02:00
|
|
|
using namespace remwharead;
|
2019-05-12 00:02:31 +02:00
|
|
|
using std::cerr;
|
|
|
|
using std::endl;
|
|
|
|
using std::string;
|
2019-05-14 22:57:51 +02:00
|
|
|
using std::chrono::system_clock;
|
2019-08-06 17:14:32 +02:00
|
|
|
using std::ofstream;
|
2019-08-08 15:36:04 +02:00
|
|
|
using std::thread;
|
|
|
|
using std::move;
|
2019-08-08 15:19:42 +02:00
|
|
|
using std::list;
|
2019-05-12 00:02:31 +02:00
|
|
|
|
2019-08-06 17:14:32 +02:00
|
|
|
int App::main(const std::vector<std::string> &args)
|
2019-05-11 02:52:33 +02:00
|
|
|
{
|
2019-05-22 12:25:06 +02:00
|
|
|
std::locale::global(std::locale("")); // Set locale globally.
|
|
|
|
|
2019-08-06 17:14:32 +02:00
|
|
|
if (_version_requested)
|
2019-05-12 20:19:00 +02:00
|
|
|
{
|
2019-08-06 17:14:32 +02:00
|
|
|
print_version();
|
|
|
|
}
|
|
|
|
else if (_help_requested)
|
|
|
|
{
|
|
|
|
print_help();
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
if (_argument_error)
|
|
|
|
{
|
2019-08-08 20:37:38 +02:00
|
|
|
return EINVAL;
|
2019-08-06 17:14:32 +02:00
|
|
|
}
|
|
|
|
if (args.size() > 0)
|
|
|
|
{
|
|
|
|
_uri = args[0];
|
|
|
|
}
|
|
|
|
if (_uri.empty() && _format == export_format::undefined)
|
|
|
|
{
|
2019-08-06 19:02:22 +02:00
|
|
|
cerr << "Error: You have to specify either an URI or --export.\n";
|
2019-08-08 20:37:38 +02:00
|
|
|
return EINVAL;
|
2019-08-06 17:14:32 +02:00
|
|
|
}
|
2019-05-12 20:19:00 +02:00
|
|
|
}
|
2019-05-12 00:02:31 +02:00
|
|
|
|
2019-05-14 20:45:48 +02:00
|
|
|
Database db;
|
2019-05-14 22:57:51 +02:00
|
|
|
if (!db)
|
|
|
|
{
|
2019-08-08 20:37:38 +02:00
|
|
|
cerr << "Error: Database could not be opened.\n";
|
|
|
|
return EIO;
|
2019-05-14 22:57:51 +02:00
|
|
|
}
|
|
|
|
|
2019-08-06 17:14:32 +02:00
|
|
|
if (!_uri.empty())
|
2019-05-14 22:57:51 +02:00
|
|
|
{
|
2019-08-06 17:14:32 +02:00
|
|
|
URI uri(_uri);
|
2019-05-16 08:36:35 +02:00
|
|
|
html_extract page = uri.get();
|
2019-08-06 12:13:27 +02:00
|
|
|
if (!page)
|
2019-05-18 13:51:41 +02:00
|
|
|
{
|
2019-05-21 11:59:58 +02:00
|
|
|
cerr << "Error: Could not fetch page.\n";
|
2019-08-06 12:13:27 +02:00
|
|
|
cerr << page.error << endl;
|
2019-08-08 20:37:38 +02:00
|
|
|
return EHOSTUNREACH;
|
2019-05-18 13:51:41 +02:00
|
|
|
}
|
2019-08-06 12:40:52 +02:00
|
|
|
archive_answer archive;
|
2019-08-06 17:14:32 +02:00
|
|
|
if (_archive)
|
2019-05-22 09:43:58 +02:00
|
|
|
{
|
2019-08-06 12:40:52 +02:00
|
|
|
archive = uri.archive();
|
|
|
|
if (!archive)
|
|
|
|
{
|
|
|
|
cerr << "Error archiving URL: " << archive.error << endl;
|
|
|
|
}
|
2019-05-22 09:43:58 +02:00
|
|
|
}
|
2019-08-06 17:14:32 +02:00
|
|
|
db.store({_uri, archive.uri, system_clock::now(), _tags,
|
2019-05-16 00:05:18 +02:00
|
|
|
page.title, page.description, page.fulltext});
|
|
|
|
}
|
|
|
|
|
2019-08-06 17:14:32 +02:00
|
|
|
ofstream file;
|
|
|
|
if (!_file.empty())
|
2019-05-16 08:37:50 +02:00
|
|
|
{
|
2019-08-06 17:14:32 +02:00
|
|
|
file.open(_file);
|
2019-05-16 08:37:50 +02:00
|
|
|
if (!file.good())
|
|
|
|
{
|
2019-08-06 17:14:32 +02:00
|
|
|
cerr << "Error: Could not open file: " << _file << endl;
|
2019-08-08 20:37:38 +02:00
|
|
|
return EIO;
|
2019-05-16 08:37:50 +02:00
|
|
|
}
|
|
|
|
}
|
2019-08-06 17:14:32 +02:00
|
|
|
|
|
|
|
if (_format != export_format::undefined)
|
2019-05-16 00:05:18 +02:00
|
|
|
{
|
2019-08-08 15:19:42 +02:00
|
|
|
list<Database::entry> entries = db.retrieve(_timespan[0], _timespan[1]);
|
2019-08-06 11:20:30 +02:00
|
|
|
|
2019-08-06 17:14:32 +02:00
|
|
|
if (!_search_tags.empty())
|
2019-05-16 08:37:50 +02:00
|
|
|
{
|
2019-08-08 15:36:04 +02:00
|
|
|
Search search(entries);
|
2019-08-06 17:14:32 +02:00
|
|
|
entries = search.search_tags(_search_tags, _regex);
|
2019-05-16 08:37:50 +02:00
|
|
|
}
|
2019-08-06 17:14:32 +02:00
|
|
|
else if (!_search_all.empty())
|
2019-05-19 10:55:45 +02:00
|
|
|
{
|
2019-08-08 15:36:04 +02:00
|
|
|
const size_t len = entries.size();
|
|
|
|
constexpr size_t min_len = 100;
|
2019-08-08 19:06:57 +02:00
|
|
|
constexpr size_t min_per_thread = 50;
|
|
|
|
const size_t n_threads = thread::hardware_concurrency() / 3 + 1;
|
|
|
|
size_t cut_at = len;
|
|
|
|
if (len > min_len)
|
|
|
|
{ // If there are over `min_len` entries, use `n_threads` threads.
|
|
|
|
cut_at = len / n_threads;
|
|
|
|
|
|
|
|
// But don't use less than `min_per_thread` entries per thread.
|
|
|
|
if (cut_at < min_per_thread)
|
|
|
|
{
|
|
|
|
cut_at = min_per_thread;
|
|
|
|
}
|
|
|
|
}
|
2019-08-08 15:36:04 +02:00
|
|
|
|
|
|
|
list<list<Database::entry>> segments;
|
|
|
|
|
|
|
|
// Use threads if list is big.
|
|
|
|
while (entries.size() > cut_at)
|
|
|
|
{
|
|
|
|
list<Database::entry> segment;
|
|
|
|
|
|
|
|
auto it = entries.begin();
|
|
|
|
std::advance(it, cut_at);
|
|
|
|
|
|
|
|
// Move the first `cut_at` entries into `segments`.
|
|
|
|
segment.splice(segment.begin(), entries, entries.begin(), it);
|
|
|
|
segments.push_back(move(segment));
|
|
|
|
}
|
|
|
|
// Move rest of `entries` into `segments`.
|
|
|
|
segments.push_back(move(entries));
|
|
|
|
|
|
|
|
list<thread> threads;
|
|
|
|
for (auto &segment : segments)
|
|
|
|
{
|
|
|
|
thread t(
|
|
|
|
[&]
|
|
|
|
{
|
|
|
|
Search search(segment);
|
|
|
|
// Replace `segment` with `result`.
|
|
|
|
segment = search.search_all(_search_all, _regex);
|
|
|
|
});
|
|
|
|
threads.push_back(move(t));
|
|
|
|
}
|
|
|
|
|
|
|
|
for (thread &t : threads)
|
|
|
|
{
|
|
|
|
t.join();
|
|
|
|
// Move each of `segments` into `entries`.
|
|
|
|
entries.splice(entries.end(), segments.front());
|
|
|
|
segments.pop_front();
|
|
|
|
}
|
2019-05-19 10:55:45 +02:00
|
|
|
}
|
2019-05-19 09:42:52 +02:00
|
|
|
|
2019-08-06 17:14:32 +02:00
|
|
|
switch (_format)
|
2019-05-19 09:42:52 +02:00
|
|
|
{
|
|
|
|
case export_format::csv:
|
2019-05-16 08:37:50 +02:00
|
|
|
{
|
2019-05-19 09:42:52 +02:00
|
|
|
if (file.is_open())
|
|
|
|
{
|
2019-06-06 16:06:25 +02:00
|
|
|
Export::CSV(entries, file).print();
|
2019-05-19 09:42:52 +02:00
|
|
|
file.close();
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2019-06-06 16:06:25 +02:00
|
|
|
Export::CSV(entries).print();
|
2019-05-19 09:42:52 +02:00
|
|
|
}
|
|
|
|
break;
|
2019-05-16 08:37:50 +02:00
|
|
|
}
|
2019-05-19 09:42:52 +02:00
|
|
|
case export_format::asciidoc:
|
2019-05-16 08:37:50 +02:00
|
|
|
{
|
2019-05-19 09:42:52 +02:00
|
|
|
if (file.is_open())
|
|
|
|
{
|
2019-06-06 16:16:57 +02:00
|
|
|
Export::AsciiDoc(entries, file).print();
|
2019-05-19 09:42:52 +02:00
|
|
|
file.close();
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2019-06-06 16:16:57 +02:00
|
|
|
Export::AsciiDoc(entries).print();
|
2019-05-19 09:42:52 +02:00
|
|
|
}
|
|
|
|
break;
|
2019-05-16 08:37:50 +02:00
|
|
|
}
|
2019-05-26 15:47:06 +02:00
|
|
|
case export_format::bookmarks:
|
|
|
|
{
|
|
|
|
if (file.is_open())
|
|
|
|
{
|
2019-06-06 17:11:56 +02:00
|
|
|
Export::Bookmarks(entries, file).print();
|
2019-05-26 15:47:06 +02:00
|
|
|
file.close();
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2019-06-06 17:11:56 +02:00
|
|
|
Export::Bookmarks(entries).print();
|
2019-05-26 15:47:06 +02:00
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
2019-07-21 16:37:36 +02:00
|
|
|
case export_format::simple:
|
|
|
|
{
|
|
|
|
if (file.is_open())
|
|
|
|
{
|
|
|
|
Export::Simple(entries, file).print();
|
|
|
|
file.close();
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
Export::Simple(entries).print();
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
2019-05-19 09:42:52 +02:00
|
|
|
default:
|
2019-05-16 08:37:50 +02:00
|
|
|
{
|
2019-05-19 09:42:52 +02:00
|
|
|
break;
|
|
|
|
}
|
2019-05-16 08:37:50 +02:00
|
|
|
}
|
2019-05-14 22:57:51 +02:00
|
|
|
}
|
2019-05-14 20:45:48 +02:00
|
|
|
|
2019-08-08 20:37:38 +02:00
|
|
|
return 0;
|
2019-05-11 02:52:33 +02:00
|
|
|
}
|
2019-08-06 17:14:32 +02:00
|
|
|
|
|
|
|
// Program entry point, generated by a macro for the `App` class.
// NOTE(review): presumably Poco's POCO_APP_MAIN, which expands to a main()
// that initializes an `App` instance with argc/argv and runs it — confirm
// against the Poco::Util::Application documentation.
POCO_APP_MAIN(App)
|