Merge branch 'develop' into main

Replace std::regex with Poco::RegularExpression.
This commit is contained in:
tastytea 2019-09-20 18:07:47 +02:00
commit f9ecbdd470
Signed by: tastytea
GPG Key ID: CFC39497F1B26E07
5 changed files with 74 additions and 61 deletions

View File

@@ -141,11 +141,14 @@ steps:
- alias apt-get='rm -f /var/cache/apt/archives/lock && apt-get' - alias apt-get='rm -f /var/cache/apt/archives/lock && apt-get'
- apt-get update -q - apt-get update -q
- apt-get install -qy g++ cmake pkg-config libpoco-dev libxdg-basedir-dev asciidoc catch - apt-get install -qy g++ cmake pkg-config libpoco-dev libxdg-basedir-dev asciidoc catch
- apt-get install -qy build-essential file - apt-get install -qy build-essential file zip
- rm -rf build && mkdir -p build && cd build - rm -rf build && mkdir -p build && cd build
- cmake -DCMAKE_INSTALL_PREFIX=/usr -DWITH_MOZILLA=YES -DMOZILLA_NMH_DIR="lib/mozilla/native-messaging-hosts" -DWITH_DEB=YES .. - cmake -DCMAKE_INSTALL_PREFIX=/usr -DWITH_MOZILLA=YES -DMOZILLA_NMH_DIR="lib/mozilla/native-messaging-hosts" -DWITH_DEB=YES ..
- make package - make package
- cp -v remwharead_${DRONE_TAG}-0_amd64.deb .. - cp -v remwharead_${DRONE_TAG}-0_amd64.deb ..
- cd ../browser-plugins/webextension
- ./build_xpi.sh
- cp -v ../remwharead.xpi ../../
volumes: volumes:
- name: debian-package-cache - name: debian-package-cache
path: /var/cache/apt/archives path: /var/cache/apt/archives
@@ -162,6 +165,7 @@ steps:
files: files:
- remwharead_${DRONE_TAG}-0_amd64.deb - remwharead_${DRONE_TAG}-0_amd64.deb
# - remwharead-${DRONE_TAG}-0.x86_64.rpm # - remwharead-${DRONE_TAG}-0.x86_64.rpm
- remwharead.xpi
checksum: checksum:
- sha512 - sha512

View File

@@ -144,7 +144,7 @@ namespace remwharead
* *
* @since 0.6.0 * @since 0.6.0
*/ */
const string unescape_html(const string &html); const string unescape_html(string html);
/*! /*!
* @brief Replace newlines with spaces. * @brief Replace newlines with spaces.

View File

@@ -16,7 +16,6 @@
#include <iostream> #include <iostream>
#include <string> #include <string>
#include <regex>
#include <algorithm> #include <algorithm>
#include <utility> #include <utility>
#include <locale> #include <locale>
@@ -29,8 +28,6 @@ namespace remwharead
using std::string; using std::string;
using std::cerr; using std::cerr;
using std::endl; using std::endl;
using std::regex;
using std::regex_replace;
using tagpair = std::pair<string,list<Database::entry>>; using tagpair = std::pair<string,list<Database::entry>>;
void Export::AsciiDoc::print() const void Export::AsciiDoc::print() const

View File

@@ -14,7 +14,6 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>. * along with this program. If not, see <http://www.gnu.org/licenses/>.
*/ */
#include <regex>
#include <algorithm> #include <algorithm>
#include <locale> #include <locale>
#include <list> #include <list>
@@ -22,18 +21,17 @@
#include <utility> #include <utility>
#include <iterator> #include <iterator>
#include <Poco/UTF8String.h> #include <Poco/UTF8String.h>
#include <Poco/RegularExpression.h>
#include "search.hpp" #include "search.hpp"
namespace remwharead namespace remwharead
{ {
using std::list; using std::list;
using std::regex;
using std::regex_search;
using std::smatch;
using std::find; using std::find;
using std::find_if; using std::find_if;
using std::thread; using std::thread;
using std::move; using std::move;
using RegEx = Poco::RegularExpression;
Search::Search(const list<Database::entry> &entries) Search::Search(const list<Database::entry> &entries)
:_entries(entries) :_entries(entries)
@@ -43,30 +41,37 @@ namespace remwharead
const const
{ {
vector<vector<string>> searchlist; vector<vector<string>> searchlist;
const regex re_or("(.+?) (OR|\\|\\|) "); const RegEx re_or("(.+?) (OR|\\|\\|) ");
const regex re_and("(.+?) (AND|&&) "); const RegEx re_and("(.+?) (AND|&&) ");
smatch match; RegEx::MatchVec matches;
string::size_type pos = 0;
vector<string> subexpressions; vector<string> subexpressions;
{ // Split expression at OR. { // Split expression at OR.
while (regex_search(expression, match, re_or)) while (re_or.match(expression, pos, matches) != 0)
{ {
subexpressions.push_back(match[1].str()); const string &subexpr = expression.substr(matches[1].offset,
expression = match.suffix().str(); matches[1].length);
subexpressions.push_back(subexpr);
pos = matches[0].offset + matches[0].length;
} }
subexpressions.push_back(expression); subexpressions.push_back(expression.substr(pos));
} }
{ {
for (string sub : subexpressions) for (string sub : subexpressions)
{ // Split each OR-slice at AND. { // Split each OR-slice at AND.
vector<string> terms; vector<string> terms;
while (regex_search(sub, match, re_and)) pos = 0;
while (re_and.match(sub, pos, matches) != 0)
{ {
terms.push_back(to_lowercase(match[1].str())); const string &term = sub.substr(matches[1].offset,
sub = match.suffix().str(); matches[1].length);
terms.push_back(to_lowercase(term));
pos = matches[0].offset + matches[0].length;
} }
terms.push_back(to_lowercase(sub)); terms.push_back(to_lowercase(sub.substr(pos)));
searchlist.push_back(terms); searchlist.push_back(terms);
} }
} }
@@ -100,8 +105,8 @@ namespace remwharead
s = to_lowercase(s); s = to_lowercase(s);
if (is_re) if (is_re)
{ {
const regex re("^" + tag + "$"); const RegEx re("^" + tag + "$");
return regex_search(s, re); return (re == s);
} }
else else
{ {
@@ -154,19 +159,19 @@ namespace remwharead
// Set matched_* to false if term is not found. // Set matched_* to false if term is not found.
if (is_re) if (is_re)
{ {
const regex re(term); const RegEx re(term);
if(!regex_search(title, re)) if (!(re == title))
{ {
matched_title = false; matched_title = false;
} }
if(!regex_search(description, re)) if (!(re == description))
{ {
matched_description = false; matched_description = false;
} }
if(!regex_search(fulltext, re)) if (!(re == fulltext))
{ {
matched_fulltext = false; matched_fulltext = false;
} }

View File

@@ -17,10 +17,10 @@
#include <sstream> #include <sstream>
#include <cstdint> #include <cstdint>
#include <iostream> #include <iostream>
#include <regex>
#include <locale> #include <locale>
#include <codecvt> #include <codecvt>
#include <exception> #include <exception>
#include <vector>
#include <Poco/Net/HTTPClientSession.h> #include <Poco/Net/HTTPClientSession.h>
#include <Poco/Net/HTTPSClientSession.h> #include <Poco/Net/HTTPSClientSession.h>
#include <Poco/Net/HTTPRequest.h> #include <Poco/Net/HTTPRequest.h>
@@ -29,20 +29,17 @@
#include <Poco/URI.h> #include <Poco/URI.h>
#include <Poco/Environment.h> #include <Poco/Environment.h>
#include <Poco/Exception.h> #include <Poco/Exception.h>
#include <Poco/RegularExpression.h>
#include "version.hpp" #include "version.hpp"
#include "uri.hpp" #include "uri.hpp"
namespace remwharead namespace remwharead
{ {
using std::regex;
using std::regex_replace;
using std::regex_search;
using std::smatch;
using std::regex_constants::icase;
using std::array; using std::array;
using std::istream; using std::istream;
using std::unique_ptr; using std::unique_ptr;
using std::make_unique; using std::make_unique;
using std::vector;
using Poco::Net::HTTPClientSession; using Poco::Net::HTTPClientSession;
using Poco::Net::HTTPSClientSession; using Poco::Net::HTTPSClientSession;
using Poco::Net::HTTPRequest; using Poco::Net::HTTPRequest;
@@ -50,6 +47,7 @@ namespace remwharead
using Poco::Net::HTTPMessage; using Poco::Net::HTTPMessage;
using Poco::StreamCopier; using Poco::StreamCopier;
using Poco::Environment; using Poco::Environment;
using RegEx = Poco::RegularExpression;
html_extract::operator bool() html_extract::operator bool()
{ {
@@ -205,12 +203,16 @@ namespace remwharead
const string URI::extract_title(const string &html) const string URI::extract_title(const string &html)
{ {
const regex re_htmlfile("\\.(.?html?|xml|rss)$"); const RegEx re_htmlfile(".*\\.(.?html?|xml|rss)$", RegEx::RE_CASELESS);
if (_uri.substr(0, 4) == "http" || regex_search(_uri, re_htmlfile)) if (_uri.substr(0, 4) == "http" || re_htmlfile.match(_uri))
{ {
smatch match; const RegEx re_title("<title>([^<]+)", RegEx::RE_CASELESS);
regex_search(html, match, regex("<title>([^<]+)", icase)); vector<string> matches;
return remove_newlines(unescape_html(match[1].str())); re_title.split(html, matches);
if (matches.size() >= 2)
{
return remove_newlines(unescape_html(matches[1]));
}
} }
return ""; return "";
@@ -218,13 +220,17 @@ namespace remwharead
const string URI::extract_description(const string &html) const string URI::extract_description(const string &html)
{ {
const regex re_htmlfile("\\.(.?html?|xml|rss)$"); const RegEx re_htmlfile(".*\\.(.?html?|xml|rss)$", RegEx::RE_CASELESS);
if (_uri.substr(0, 4) == "http" || regex_search(_uri, re_htmlfile)) if (_uri.substr(0, 4) == "http" || re_htmlfile.match(_uri))
{ {
smatch match; const RegEx re_desc("description\"[^>]+content=\"([^\"]+)",
const regex re("description\"[^>]+content=\"([^\"]+)", icase); RegEx::RE_CASELESS);
regex_search(html, match, re); vector<string> matches;
return remove_newlines(strip_html(match[1].str())); re_desc.split(html, matches);
if (matches.size() >= 2)
{
return remove_newlines(unescape_html(matches[1]));
}
} }
return ""; return "";
@@ -244,8 +250,9 @@ namespace remwharead
out.replace(pos, 1, ""); out.replace(pos, 1, "");
} }
out = regex_replace(out, regex("\\s+\n"), "\n"); // Remove space at eol. // Remove whitespace at eol.
out = regex_replace(out, regex("\n{2,}"), "\n"); // Reduce newlines. RegEx("\\s+\n").subst(out, "\n", RegEx::RE_GLOBAL);
RegEx("\n{2,}").subst(out, "\n", RegEx::RE_GLOBAL); // Reduce newlines.
return unescape_html(out); return unescape_html(out);
} }
@@ -288,32 +295,32 @@ namespace remwharead
return out; return out;
} }
const string URI::unescape_html(const string &html) const string URI::unescape_html(string html)
{ {
string buffer = html;
string output;
// Used to convert int to utf-8 char. // Used to convert int to utf-8 char.
std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> u8c; std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> u8c;
regex re_entity("&#(x)?([[:alnum:]]{1,8});"); const RegEx re_entity("&#(x)?([[:alnum:]]{1,8});");
smatch match; RegEx::MatchVec matches;
string::size_type pos = 0;
while (regex_search(buffer, match, re_entity)) while (re_entity.match(html, pos, matches) != 0)
{ {
char32_t codepoint = 0; char32_t codepoint = 0;
const string number = html.substr(matches[2].offset,
matches[2].length);
// 'x' in front of the number means it's hexadecimal, else decimal. // 'x' in front of the number means it's hexadecimal, else decimal.
if (match[1].length() == 1) if (matches[1].length != 0)
{ {
codepoint = std::stoi(match[2].str(), nullptr, 16); codepoint = std::stoi(number, nullptr, 16);
} }
else else
{ {
codepoint = std::stoi(match[2].str(), nullptr, 10); codepoint = std::stoi(number, nullptr, 10);
} }
output += match.prefix().str() + u8c.to_bytes(codepoint); const string unicode = u8c.to_bytes(codepoint);
buffer = match.suffix().str(); html.replace(matches[0].offset, matches[0].length, unicode);
pos = matches[0].offset + unicode.length();
} }
output += buffer;
// Source: https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_ // Source: https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_
// entity_references#Character_entity_references_in_HTML // entity_references#Character_entity_references_in_HTML
@@ -581,11 +588,11 @@ namespace remwharead
for (auto &pair : names) for (auto &pair : names)
{ {
const regex re('&' + pair.first + ';'); const RegEx re('&' + pair.first + ';');
output = regex_replace(output, re, u8c.to_bytes(pair.second)); re.subst(html, u8c.to_bytes(pair.second), RegEx::RE_GLOBAL);
} }
return output; return html;
} }
const archive_answer URI::archive() const archive_answer URI::archive()