From af1993c71f4c35345f8bb5f88ccd7e7976f8f1dc Mon Sep 17 00:00:00 2001 From: tastytea Date: Sun, 2 Feb 2020 15:01:50 +0100 Subject: [PATCH 1/2] Add hyperlinks to git commands in contributing guidelines. --- CONTRIBUTING.adoc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/CONTRIBUTING.adoc b/CONTRIBUTING.adoc index eab82a8..720981f 100644 --- a/CONTRIBUTING.adoc +++ b/CONTRIBUTING.adoc @@ -4,6 +4,8 @@ :contact-email: tastytea@tastytea.de :contact-xmpp: {contact-email} :contact-fediverse: https://likeable.space/users/tastytea +:uri-git-format-patch: https://git-scm.com/docs/git-format-patch +:uri-git-send-email: https://git-scm.com/docs/git-send-email == How to contribute @@ -26,4 +28,5 @@ Please use similar coding conventions as the rest of the project. The basic rule to remember is to write code in the same style as the existing/surrounding code. You can also send me your patches via mailto:{contact-email}[E-Mail], ideally -using `git format-patch` or `git send-email`. +using link:{uri-git-format-patch}[git format-patch] or +link:{uri-git-send-email}[git send-email]. From 3bbc24ba577e5caab57cb9e5abb934483af9e3c8 Mon Sep 17 00:00:00 2001 From: tastytea Date: Sun, 2 Feb 2020 15:02:19 +0100 Subject: [PATCH 2/2] Decrease loops drastically in unescape_html(). --- src/helpers.cpp | 73 ++++++++++++++++++++++++++++--------------------- 1 file changed, 42 insertions(+), 31 deletions(-) diff --git a/src/helpers.cpp b/src/helpers.cpp index fedcf81..ce5f679 100644 --- a/src/helpers.cpp +++ b/src/helpers.cpp @@ -16,57 +16,34 @@ #include "helpers.hpp" -#include #include #include +#include #include +#include #include -#include namespace mastodonpp { -using std::array; -using std::stol; +using std::stoul; using std::codecvt_utf8; using std::wstring_convert; +using std::map; using std::regex; using std::regex_search; using std::smatch; using std::string_view; using std::move; -using std::pair; string unescape_html(string html) { string buffer{move(html)}; string output; - // Used to convert int to utf-8 char. - wstring_convert, char32_t> u8c; - // Matches numbered entities between 1 and 8 digits, decimal or hexadecimal. - const regex re_entity{"&#(x)?([[:alnum:]]{1,8});"}; - smatch match; - - while (regex_search(buffer, match, re_entity)) - { - const char32_t codepoint{[&match] - { - // 'x' in front of the number means it's hexadecimal, else decimal. - if (match[1].length() == 1) - { - return static_cast(stol(match[2].str(), nullptr, 16)); - } - return static_cast(stol(match[2].str(), nullptr, 10)); - }()}; - output += match.prefix().str() + u8c.to_bytes(codepoint); - buffer = match.suffix().str(); - } - output += buffer; - // Source: https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_ // entity_references#Character_entity_references_in_HTML - constexpr array, 258> names + const map names {{ { "exclamation", 0x0021 }, { "quot", 0x0022 }, @@ -328,11 +305,45 @@ string unescape_html(string html) { "diams", 0x2666 } }}; - for (const auto &pair : names) + // Used to convert number to utf-8 char. + wstring_convert, char32_t> u8c; + // Matches numbered entities between 1 and 8 digits, decimal or hexadecimal, + // or named entities. + const regex re_entity{"&(#(x)?([[:alnum:]]{1,8})" + "|[^;[:space:][:punct:]]+);"}; + smatch match; + + while (regex_search(buffer, match, re_entity)) { - const regex re((string("&") += pair.first) += ';'); - output = regex_replace(output, re, u8c.to_bytes(pair.second)); + output += match.prefix().str(); + try + { + const char32_t codepoint{[&match, &names] + { + // If it doesn't start with a '#' it is a named entity. + if (match[1].str()[0] != '#') + { + return names.at(match[1].str()); + } + // 'x' after '#' means the number is hexadecimal. + if (match[2].length() == 1) + { + return static_cast(stoul(match[3].str(), + nullptr, 16)); + } + // '#' without 'x' means the number is decimal. + return static_cast(stoul(match[3].str(), + nullptr, 10)); + }()}; + output += u8c.to_bytes(codepoint); + } + catch (const std::out_of_range &) // Named entity could not be found. + { + output += match.str(); + } + buffer = match.suffix().str(); } + output += buffer; return output; }