From 04ef635b8909460f0e028bfa896781e28934eb49 Mon Sep 17 00:00:00 2001 From: tastytea Date: Fri, 20 Sep 2019 21:35:37 +0200 Subject: [PATCH] 2019-09-20 --- docs/adoc_8hpp_source.html | 2 +- docs/annotated.html | 2 +- docs/bookmarks_8hpp_source.html | 2 +- docs/classes.html | 2 +- docs/classremwharead_1_1Database-members.html | 2 +- docs/classremwharead_1_1Database.html | 2 +- ...wharead_1_1Export_1_1AsciiDoc-members.html | 2 +- ...classremwharead_1_1Export_1_1AsciiDoc.html | 6 +-- ...haread_1_1Export_1_1Bookmarks-members.html | 2 +- ...lassremwharead_1_1Export_1_1Bookmarks.html | 6 +-- ...ssremwharead_1_1Export_1_1CSV-members.html | 2 +- docs/classremwharead_1_1Export_1_1CSV.html | 6 +-- ...aread_1_1Export_1_1ExportBase-members.html | 2 +- ...assremwharead_1_1Export_1_1ExportBase.html | 10 ++-- ...sremwharead_1_1Export_1_1JSON-members.html | 2 +- docs/classremwharead_1_1Export_1_1JSON.html | 6 +-- ...ssremwharead_1_1Export_1_1RSS-members.html | 2 +- docs/classremwharead_1_1Export_1_1RSS.html | 6 +-- ...emwharead_1_1Export_1_1Simple-members.html | 2 +- docs/classremwharead_1_1Export_1_1Simple.html | 6 +-- docs/classremwharead_1_1Search-members.html | 2 +- docs/classremwharead_1_1Search.html | 12 ++--- docs/classremwharead_1_1URI-members.html | 4 +- docs/classremwharead_1_1URI.html | 48 +++++++++---------- docs/csv_8hpp_source.html | 2 +- .../dir_529a3e0c72053aaa27e3266821c6a27e.html | 2 +- .../dir_68267d1309a1af8e8297ef4c3efbcdba.html | 2 +- .../dir_a26cd56a311ed2c6b00868ab8167f874.html | 2 +- .../dir_c85d3e3c5052e9ad9ce18c6863244a25.html | 2 +- .../dir_d44c64559bbebec7f509842c48db8b23.html | 2 +- docs/export_8hpp_source.html | 4 +- docs/files.html | 2 +- docs/functions.html | 4 +- docs/functions_func.html | 4 +- docs/functions_rela.html | 2 +- docs/functions_type.html | 2 +- docs/hierarchy.html | 2 +- docs/index.html | 2 +- docs/json_8hpp_source.html | 2 +- docs/remwharead_8hpp_source.html | 2 +- docs/rss_8hpp_source.html | 2 +- docs/search/all_f.js | 2 +- docs/search/functions_a.js | 2 +- docs/search_8hpp_source.html | 10 ++-- docs/simple_8hpp_source.html | 2 +- docs/sqlite_8hpp_source.html | 2 +- ...mwharead_1_1Database_1_1entry-members.html | 2 +- ...structremwharead_1_1Database_1_1entry.html | 2 +- ...remwharead_1_1archive__answer-members.html | 2 +- docs/structremwharead_1_1archive__answer.html | 2 +- ...ctremwharead_1_1html__extract-members.html | 2 +- docs/structremwharead_1_1html__extract.html | 2 +- docs/time_8hpp.html | 2 +- docs/time_8hpp_source.html | 2 +- docs/types_8hpp.html | 2 +- docs/types_8hpp_source.html | 2 +- docs/uri_8hpp_source.html | 24 +++++----- 57 files changed, 120 insertions(+), 120 deletions(-) diff --git a/docs/adoc_8hpp_source.html b/docs/adoc_8hpp_source.html index 2888dde..3370cb0 100644 --- a/docs/adoc_8hpp_source.html +++ b/docs/adoc_8hpp_source.html @@ -22,7 +22,7 @@
remwharead -  0.8.0 +  0.8.1
diff --git a/docs/annotated.html b/docs/annotated.html index d599a3e..2d016c7 100644 --- a/docs/annotated.html +++ b/docs/annotated.html @@ -22,7 +22,7 @@
remwharead -  0.8.0 +  0.8.1
diff --git a/docs/bookmarks_8hpp_source.html b/docs/bookmarks_8hpp_source.html index b778026..84bba60 100644 --- a/docs/bookmarks_8hpp_source.html +++ b/docs/bookmarks_8hpp_source.html @@ -22,7 +22,7 @@
remwharead -  0.8.0 +  0.8.1
diff --git a/docs/classes.html b/docs/classes.html index 45d0300..9a974ea 100644 --- a/docs/classes.html +++ b/docs/classes.html @@ -22,7 +22,7 @@
remwharead -  0.8.0 +  0.8.1
diff --git a/docs/classremwharead_1_1Database-members.html b/docs/classremwharead_1_1Database-members.html index a251604..fff1966 100644 --- a/docs/classremwharead_1_1Database-members.html +++ b/docs/classremwharead_1_1Database-members.html @@ -22,7 +22,7 @@
remwharead -  0.8.0 +  0.8.1
diff --git a/docs/classremwharead_1_1Database.html b/docs/classremwharead_1_1Database.html index 5d7b8e2..3b56edf 100644 --- a/docs/classremwharead_1_1Database.html +++ b/docs/classremwharead_1_1Database.html @@ -22,7 +22,7 @@
remwharead -  0.8.0 +  0.8.1
diff --git a/docs/classremwharead_1_1Export_1_1AsciiDoc-members.html b/docs/classremwharead_1_1Export_1_1AsciiDoc-members.html index 8467f57..213a69d 100644 --- a/docs/classremwharead_1_1Export_1_1AsciiDoc-members.html +++ b/docs/classremwharead_1_1Export_1_1AsciiDoc-members.html @@ -22,7 +22,7 @@
remwharead -  0.8.0 +  0.8.1
diff --git a/docs/classremwharead_1_1Export_1_1AsciiDoc.html b/docs/classremwharead_1_1Export_1_1AsciiDoc.html index bda8821..a64ece4 100644 --- a/docs/classremwharead_1_1Export_1_1AsciiDoc.html +++ b/docs/classremwharead_1_1Export_1_1AsciiDoc.html @@ -22,7 +22,7 @@
remwharead -  0.8.0 +  0.8.1
@@ -101,7 +101,7 @@ Public Member Functions Additional Inherited Members - Protected Member Functions inherited from remwharead::Export::ExportBase const list< Database::entrysort_entries (list< Database::entry > entries) const - Sort entries from newest to oldest. More...
+ Sort entries from newest to oldest and remove duplicates. More...
   ExportBase (const list< Database::entry > &entries, ostream &out=cout)  Export list of Database::entry. More...
@@ -146,7 +146,7 @@ ostream & _out -
25  : _entries(sort_entries(entries))
26  , _out(out)
27  {}
const list< Database::entry > sort_entries(list< Database::entry > entries) const
Sort entries from newest to oldest.
Definition: export.cpp:30
+
25  : _entries(sort_entries(entries))
26  , _out(out)
27  {}
const list< Database::entry > sort_entries(list< Database::entry > entries) const
Sort entries from newest to oldest and remove duplicates.
Definition: export.cpp:30
diff --git a/docs/classremwharead_1_1Export_1_1Bookmarks-members.html b/docs/classremwharead_1_1Export_1_1Bookmarks-members.html index 9ca8f64..61619fb 100644 --- a/docs/classremwharead_1_1Export_1_1Bookmarks-members.html +++ b/docs/classremwharead_1_1Export_1_1Bookmarks-members.html @@ -22,7 +22,7 @@
remwharead -  0.8.0 +  0.8.1
diff --git a/docs/classremwharead_1_1Export_1_1Bookmarks.html b/docs/classremwharead_1_1Export_1_1Bookmarks.html index 5b2d0f8..e0bebab 100644 --- a/docs/classremwharead_1_1Export_1_1Bookmarks.html +++ b/docs/classremwharead_1_1Export_1_1Bookmarks.html @@ -22,7 +22,7 @@
remwharead -  0.8.0 +  0.8.1
@@ -101,7 +101,7 @@ Public Member Functions Additional Inherited Members - Protected Member Functions inherited from remwharead::Export::ExportBase const list< Database::entrysort_entries (list< Database::entry > entries) const - Sort entries from newest to oldest. More...
+ Sort entries from newest to oldest and remove duplicates. More...
   ExportBase (const list< Database::entry > &entries, ostream &out=cout)  Export list of Database::entry. More...
@@ -146,7 +146,7 @@ ostream & _out -
25  : _entries(sort_entries(entries))
26  , _out(out)
27  {}
const list< Database::entry > sort_entries(list< Database::entry > entries) const
Sort entries from newest to oldest.
Definition: export.cpp:30
+
25  : _entries(sort_entries(entries))
26  , _out(out)
27  {}
const list< Database::entry > sort_entries(list< Database::entry > entries) const
Sort entries from newest to oldest and remove duplicates.
Definition: export.cpp:30
diff --git a/docs/classremwharead_1_1Export_1_1CSV-members.html b/docs/classremwharead_1_1Export_1_1CSV-members.html index cf1d402..1836f9d 100644 --- a/docs/classremwharead_1_1Export_1_1CSV-members.html +++ b/docs/classremwharead_1_1Export_1_1CSV-members.html @@ -22,7 +22,7 @@
remwharead -  0.8.0 +  0.8.1
diff --git a/docs/classremwharead_1_1Export_1_1CSV.html b/docs/classremwharead_1_1Export_1_1CSV.html index 82dd935..3645b31 100644 --- a/docs/classremwharead_1_1Export_1_1CSV.html +++ b/docs/classremwharead_1_1Export_1_1CSV.html @@ -22,7 +22,7 @@
remwharead -  0.8.0 +  0.8.1
@@ -101,7 +101,7 @@ Public Member Functions Additional Inherited Members - Protected Member Functions inherited from remwharead::Export::ExportBase const list< Database::entrysort_entries (list< Database::entry > entries) const - Sort entries from newest to oldest. More...
+ Sort entries from newest to oldest and remove duplicates. More...
   ExportBase (const list< Database::entry > &entries, ostream &out=cout)  Export list of Database::entry. More...
@@ -146,7 +146,7 @@ ostream & _out -
25  : _entries(sort_entries(entries))
26  , _out(out)
27  {}
const list< Database::entry > sort_entries(list< Database::entry > entries) const
Sort entries from newest to oldest.
Definition: export.cpp:30
+
25  : _entries(sort_entries(entries))
26  , _out(out)
27  {}
const list< Database::entry > sort_entries(list< Database::entry > entries) const
Sort entries from newest to oldest and remove duplicates.
Definition: export.cpp:30
diff --git a/docs/classremwharead_1_1Export_1_1ExportBase-members.html b/docs/classremwharead_1_1Export_1_1ExportBase-members.html index 440c072..1d1024a 100644 --- a/docs/classremwharead_1_1Export_1_1ExportBase-members.html +++ b/docs/classremwharead_1_1Export_1_1ExportBase-members.html @@ -22,7 +22,7 @@
remwharead -  0.8.0 +  0.8.1
diff --git a/docs/classremwharead_1_1Export_1_1ExportBase.html b/docs/classremwharead_1_1Export_1_1ExportBase.html index 8e35006..2e74ee6 100644 --- a/docs/classremwharead_1_1Export_1_1ExportBase.html +++ b/docs/classremwharead_1_1Export_1_1ExportBase.html @@ -22,7 +22,7 @@
remwharead -  0.8.0 +  0.8.1
@@ -107,7 +107,7 @@ Public Member Functions

Protected Member Functions

const list< Database::entrysort_entries (list< Database::entry > entries) const - Sort entries from newest to oldest. More...
+ Sort entries from newest to oldest and remove duplicates. More...
  -
25  : _entries(sort_entries(entries))
26  , _out(out)
27  {}
const list< Database::entry > sort_entries(list< Database::entry > entries) const
Sort entries from newest to oldest.
Definition: export.cpp:30
+
25  : _entries(sort_entries(entries))
26  , _out(out)
27  {}
const list< Database::entry > sort_entries(list< Database::entry > entries) const
Sort entries from newest to oldest and remove duplicates.
Definition: export.cpp:30
@@ -223,7 +223,7 @@ ostream & 
-

Sort entries from newest to oldest.

+

Sort entries from newest to oldest and remove duplicates.

Parameters

@@ -165,7 +165,7 @@ ostream & 

_out _out
@@ -231,7 +231,7 @@ ostream & 
Returns
Sorted list of Database::entry.
-
31  {
32  entries.sort([](const auto &a, const auto &b)
33  {
34  return (a.datetime > b.datetime);
35  });
36  return entries;
37  }
+
31  {
32  entries.sort([](const auto &a, const auto &b)
33  {
34  return (a.datetime > b.datetime);
35  });
36  entries.unique();
37 
38  return entries;
39  }

The documentation for this class was generated from the following files: diff --git a/docs/classremwharead_1_1Export_1_1JSON.html b/docs/classremwharead_1_1Export_1_1JSON.html index 9512b5c..7cffed0 100644 --- a/docs/classremwharead_1_1Export_1_1JSON.html +++ b/docs/classremwharead_1_1Export_1_1JSON.html @@ -22,7 +22,7 @@ @@ -101,7 +101,7 @@ Public Member Functions Additional Inherited Members - + @@ -146,7 +146,7 @@ ostream &  -
25  : _entries(sort_entries(entries))
26  , _out(out)
27  {}
const list< Database::entry > sort_entries(list< Database::entry > entries) const
Sort entries from newest to oldest.
Definition: export.cpp:30
+
25  : _entries(sort_entries(entries))
26  , _out(out)
27  {}
const list< Database::entry > sort_entries(list< Database::entry > entries) const
Sort entries from newest to oldest and remove duplicates.
Definition: export.cpp:30
diff --git a/docs/classremwharead_1_1Export_1_1RSS-members.html b/docs/classremwharead_1_1Export_1_1RSS-members.html index 3e3cdea..199eb9c 100644 --- a/docs/classremwharead_1_1Export_1_1RSS-members.html +++ b/docs/classremwharead_1_1Export_1_1RSS-members.html @@ -22,7 +22,7 @@ diff --git a/docs/classremwharead_1_1Export_1_1RSS.html b/docs/classremwharead_1_1Export_1_1RSS.html index 037c7f2..2f47e7f 100644 --- a/docs/classremwharead_1_1Export_1_1RSS.html +++ b/docs/classremwharead_1_1Export_1_1RSS.html @@ -22,7 +22,7 @@ @@ -101,7 +101,7 @@ Public Member Functions Additional Inherited Members - + @@ -146,7 +146,7 @@ ostream &  -
25  : _entries(sort_entries(entries))
26  , _out(out)
27  {}
const list< Database::entry > sort_entries(list< Database::entry > entries) const
Sort entries from newest to oldest.
Definition: export.cpp:30
+
25  : _entries(sort_entries(entries))
26  , _out(out)
27  {}
const list< Database::entry > sort_entries(list< Database::entry > entries) const
Sort entries from newest to oldest and remove duplicates.
Definition: export.cpp:30
diff --git a/docs/classremwharead_1_1Export_1_1Simple-members.html b/docs/classremwharead_1_1Export_1_1Simple-members.html index 8b47e01..42de5b5 100644 --- a/docs/classremwharead_1_1Export_1_1Simple-members.html +++ b/docs/classremwharead_1_1Export_1_1Simple-members.html @@ -22,7 +22,7 @@ diff --git a/docs/classremwharead_1_1Export_1_1Simple.html b/docs/classremwharead_1_1Export_1_1Simple.html index a99ef17..452cbe5 100644 --- a/docs/classremwharead_1_1Export_1_1Simple.html +++ b/docs/classremwharead_1_1Export_1_1Simple.html @@ -22,7 +22,7 @@ @@ -101,7 +101,7 @@ Public Member Functions Additional Inherited Members - + @@ -146,7 +146,7 @@ ostream &  -
25  : _entries(sort_entries(entries))
26  , _out(out)
27  {}
const list< Database::entry > sort_entries(list< Database::entry > entries) const
Sort entries from newest to oldest.
Definition: export.cpp:30
+
25  : _entries(sort_entries(entries))
26  , _out(out)
27  {}
const list< Database::entry > sort_entries(list< Database::entry > entries) const
Sort entries from newest to oldest and remove duplicates.
Definition: export.cpp:30
diff --git a/docs/classremwharead_1_1Search-members.html b/docs/classremwharead_1_1Search-members.html index 0dfccc6..4097cab 100644 --- a/docs/classremwharead_1_1Search-members.html +++ b/docs/classremwharead_1_1Search-members.html @@ -22,7 +22,7 @@ diff --git a/docs/classremwharead_1_1Search.html b/docs/classremwharead_1_1Search.html index d489da6..65fb365 100644 --- a/docs/classremwharead_1_1Search.html +++ b/docs/classremwharead_1_1Search.html @@ -22,7 +22,7 @@ @@ -124,7 +124,7 @@ Public Member Functions

Defines the entries to search.

Since
0.7.0
-
39  :_entries(entries)
40  {}
+
37  :_entries(entries)
38  {}

Member Function Documentation

@@ -165,8 +165,8 @@ Public Member Functions
Returns
List of matching Database::entry.
Since
0.7.0
-
128  {
129  vector<vector<string>> searchlist = parse_expression(expression);
130  list<DB::entry> result = search_tags(expression, is_re);
131 
132  for (const vector<string> &terms_or : searchlist)
133  {
134  for (const DB::entry &entry : _entries)
135  {
136  // Add entry to result if all terms in an OR-slice match title,
137  // description or full text.
138  bool matched_title = true;
139  bool matched_description = true;
140  bool matched_fulltext = true;
141 
142  const auto it = find(result.begin(), result.end(), entry);
143  if (it != result.end())
144  { // Skip if already in result list.
145  continue;
146  }
147 
148  for (const string &term : terms_or)
149  {
150  const string title = to_lowercase(entry.title);
151  const string description = to_lowercase(entry.description);
152  const string fulltext = to_lowercase(entry.fulltext);
153 
154  // Set matched_* to false if term is not found.
155  if (is_re)
156  {
157  const regex re(term);
158 
159  if(!regex_search(title, re))
160  {
161  matched_title = false;
162  }
163 
164  if(!regex_search(description, re))
165  {
166  matched_description = false;
167  }
168 
169  if(!regex_search(fulltext, re))
170  {
171  matched_fulltext = false;
172  }
173  }
174  else
175  {
176  if (title.find(term) == string::npos)
177  {
178  matched_title = false;
179  }
180 
181  if (description.find(term) == string::npos)
182  {
183  matched_description = false;
184  }
185 
186  if (fulltext.find(term) == string::npos)
187  {
188  matched_fulltext = false;
189  }
190  }
191  }
192  if (matched_title == true
193  || matched_description == true
194  || matched_fulltext == true)
195  {
196  result.push_back(entry);
197  }
198  }
199  }
200 
201  return result;
202  }
struct remwharead::Database::entry entry
Describes a database entry.
-
const list< Database::entry > search_tags(string expression, const bool is_re) const
Search in tags of database entries.
Definition: search.cpp:82
+
133  {
134  vector<vector<string>> searchlist = parse_expression(expression);
135  list<DB::entry> result = search_tags(expression, is_re);
136 
137  for (const vector<string> &terms_or : searchlist)
138  {
139  for (const DB::entry &entry : _entries)
140  {
141  // Add entry to result if all terms in an OR-slice match title,
142  // description or full text.
143  bool matched_title = true;
144  bool matched_description = true;
145  bool matched_fulltext = true;
146 
147  const auto it = find(result.begin(), result.end(), entry);
148  if (it != result.end())
149  { // Skip if already in result list.
150  continue;
151  }
152 
153  for (const string &term : terms_or)
154  {
155  const string title = to_lowercase(entry.title);
156  const string description = to_lowercase(entry.description);
157  const string fulltext = to_lowercase(entry.fulltext);
158 
159  // Set matched_* to false if term is not found.
160  if (is_re)
161  {
162  const RegEx re(term);
163 
164  if (!(re == title))
165  {
166  matched_title = false;
167  }
168 
169  if (!(re == description))
170  {
171  matched_description = false;
172  }
173 
174  if (!(re == fulltext))
175  {
176  matched_fulltext = false;
177  }
178  }
179  else
180  {
181  if (title.find(term) == string::npos)
182  {
183  matched_title = false;
184  }
185 
186  if (description.find(term) == string::npos)
187  {
188  matched_description = false;
189  }
190 
191  if (fulltext.find(term) == string::npos)
192  {
193  matched_fulltext = false;
194  }
195  }
196  }
197  if (matched_title == true
198  || matched_description == true
199  || matched_fulltext == true)
200  {
201  result.push_back(entry);
202  }
203  }
204  }
205 
206  return result;
207  }
struct remwharead::Database::entry entry
Describes a database entry.
+
const list< Database::entry > search_tags(string expression, const bool is_re) const
Search in tags of database entries.
Definition: search.cpp:87
@@ -207,7 +207,7 @@ Public Member Functions
Returns
List of matching Database::entry.
Since
0.7.2
-
206  {
207  list<Database::entry> entries = _entries;
208 
209  const size_t len = entries.size();
210  constexpr size_t min_len = 100;
211  constexpr size_t min_per_thread = 50;
212  const size_t n_threads = thread::hardware_concurrency() / 3 + 1;
213  size_t cut_at = len;
214  if (len > min_len)
215  { // If there are over `min_len` entries, use `n_threads` threads.
216  cut_at = len / n_threads;
217 
218  // But don't use less than `min_per_thread` entries per thread.
219  if (cut_at < min_per_thread)
220  {
221  cut_at = min_per_thread;
222  }
223  }
224 
225  list<list<Database::entry>> segments;
226 
227  // Use threads if list is big.
228  while (entries.size() > cut_at)
229  {
230  list<Database::entry> segment;
231 
232  auto it = entries.begin();
233  std::advance(it, cut_at);
234 
235  // Move the first `cut_at` entries into `segments`.
236  segment.splice(segment.begin(), entries, entries.begin(), it);
237  segments.push_back(move(segment));
238  }
239  // Move rest of `entries` into `segments`.
240  segments.push_back(move(entries));
241 
242  list<thread> threads;
243  for (auto &segment : segments)
244  {
245  thread t(
246  [&]
247  {
248  Search search(segment);
249  // Replace `segment` with `result`.
250  segment = search.search_all(expression, is_re);
251  });
252  threads.push_back(move(t));
253  }
254 
255  for (thread &t : threads)
256  {
257  t.join();
258  // Move each of `segments` into `entries`.
259  entries.splice(entries.end(), segments.front());
260  segments.pop_front();
261  }
262 
263  return entries;
264  }
Search(const list< Database::entry > &entries)
Defines the entries to search.
Definition: search.cpp:38
+
211  {
212  list<Database::entry> entries = _entries;
213 
214  const size_t len = entries.size();
215  constexpr size_t min_len = 100;
216  constexpr size_t min_per_thread = 50;
217  const size_t n_threads = thread::hardware_concurrency() / 3 + 1;
218  size_t cut_at = len;
219  if (len > min_len)
220  { // If there are over `min_len` entries, use `n_threads` threads.
221  cut_at = len / n_threads;
222 
223  // But don't use less than `min_per_thread` entries per thread.
224  if (cut_at < min_per_thread)
225  {
226  cut_at = min_per_thread;
227  }
228  }
229 
230  list<list<Database::entry>> segments;
231 
232  // Use threads if list is big.
233  while (entries.size() > cut_at)
234  {
235  list<Database::entry> segment;
236 
237  auto it = entries.begin();
238  std::advance(it, cut_at);
239 
240  // Move the first `cut_at` entries into `segments`.
241  segment.splice(segment.begin(), entries, entries.begin(), it);
242  segments.push_back(move(segment));
243  }
244  // Move rest of `entries` into `segments`.
245  segments.push_back(move(entries));
246 
247  list<thread> threads;
248  for (auto &segment : segments)
249  {
250  thread t(
251  [&]
252  {
253  Search search(segment);
254  // Replace `segment` with `result`.
255  segment = search.search_all(expression, is_re);
256  });
257  threads.push_back(move(t));
258  }
259 
260  for (thread &t : threads)
261  {
262  t.join();
263  // Move each of `segments` into `entries`.
264  entries.splice(entries.end(), segments.front());
265  segments.pop_front();
266  }
267 
268  return entries;
269  }
Search(const list< Database::entry > &entries)
Defines the entries to search.
Definition: search.cpp:36
@@ -248,7 +248,7 @@ Public Member Functions
Returns
List of matching Database::entry.
Since
0.7.0
-
84  {
85  vector<vector<string>> searchlist = parse_expression(expression);
86  list<DB::entry> result;
87 
88  for (const vector<string> &tags_or : searchlist)
89  {
90  for (const DB::entry &entry : _entries)
91  { // Add entry to result if all tags in an OR-slice match.
92  bool matched = true;
93 
94  for (const string &tag : tags_or)
95  {
96  const auto it = find_if(
97  entry.tags.begin(), entry.tags.end(),
98  [&, is_re](string s)
99  {
100  s = to_lowercase(s);
101  if (is_re)
102  {
103  const regex re("^" + tag + "$");
104  return regex_search(s, re);
105  }
106  else
107  {
108  return (s == tag);
109  }
110  });
111  if (it == entry.tags.end())
112  {
113  matched = false;
114  }
115  }
116  if (matched == true)
117  {
118  result.push_back(entry);
119  }
120  }
121  }
122 
123  return result;
124  }
struct remwharead::Database::entry entry
Describes a database entry.
+
89  {
90  vector<vector<string>> searchlist = parse_expression(expression);
91  list<DB::entry> result;
92 
93  for (const vector<string> &tags_or : searchlist)
94  {
95  for (const DB::entry &entry : _entries)
96  { // Add entry to result if all tags in an OR-slice match.
97  bool matched = true;
98 
99  for (const string &tag : tags_or)
100  {
101  const auto it = find_if(
102  entry.tags.begin(), entry.tags.end(),
103  [&, is_re](string s)
104  {
105  s = to_lowercase(s);
106  if (is_re)
107  {
108  const RegEx re("^" + tag + "$");
109  return (re == s);
110  }
111  else
112  {
113  return (s == tag);
114  }
115  });
116  if (it == entry.tags.end())
117  {
118  matched = false;
119  }
120  }
121  if (matched == true)
122  {
123  result.push_back(entry);
124  }
125  }
126  }
127 
128  return result;
129  }
struct remwharead::Database::entry entry
Describes a database entry.
diff --git a/docs/classremwharead_1_1URI-members.html b/docs/classremwharead_1_1URI-members.html index 34a48ae..bba1762 100644 --- a/docs/classremwharead_1_1URI-members.html +++ b/docs/classremwharead_1_1URI-members.html @@ -22,7 +22,7 @@ @@ -82,7 +82,7 @@ $(function() { - +
entriesList of Database::entry to sort.
_out
remwharead -  0.8.0 +  0.8.1
remwharead -  0.8.0 +  0.8.1
- Protected Member Functions inherited from remwharead::Export::ExportBase
const list< Database::entrysort_entries (list< Database::entry > entries) const
 Sort entries from newest to oldest. More...
 Sort entries from newest to oldest and remove duplicates. More...
 
 ExportBase (const list< Database::entry > &entries, ostream &out=cout)
 Export list of Database::entry. More...
_out
remwharead -  0.8.0 +  0.8.1
remwharead -  0.8.0 +  0.8.1
- Protected Member Functions inherited from remwharead::Export::ExportBase
const list< Database::entrysort_entries (list< Database::entry > entries) const
 Sort entries from newest to oldest. More...
 Sort entries from newest to oldest and remove duplicates. More...
 
 ExportBase (const list< Database::entry > &entries, ostream &out=cout)
 Export list of Database::entry. More...
_out
remwharead -  0.8.0 +  0.8.1
remwharead -  0.8.0 +  0.8.1
- Protected Member Functions inherited from remwharead::Export::ExportBase
const list< Database::entrysort_entries (list< Database::entry > entries) const
 Sort entries from newest to oldest. More...
 Sort entries from newest to oldest and remove duplicates. More...
 
 ExportBase (const list< Database::entry > &entries, ostream &out=cout)
 Export list of Database::entry. More...
_out
remwharead -  0.8.0 +  0.8.1
remwharead -  0.8.0 +  0.8.1
remwharead -  0.8.0 +  0.8.1
remove_html_tags(const string &html, const string &tag="")remwharead::URIprotected
remove_newlines(string text)remwharead::URIprotected
strip_html(const string &html)remwharead::URIprotected
unescape_html(const string &html)remwharead::URIprotected
unescape_html(string html)remwharead::URIprotected
URI(const string &uri)remwharead::URIexplicit
~URI() (defined in remwharead::URI)remwharead::URIvirtual
diff --git a/docs/classremwharead_1_1URI.html b/docs/classremwharead_1_1URI.html index 1ca2408..8962b6f 100644 --- a/docs/classremwharead_1_1URI.html +++ b/docs/classremwharead_1_1URI.html @@ -22,7 +22,7 @@
remwharead -  0.8.0 +  0.8.1
@@ -110,9 +110,9 @@ Protected Member Functions const string remove_html_tags (const string &html, const string &tag="")  Remove HTML tags. More...
  -const string unescape_html (const string &html) - Convert HTML entities to UTF-8. More...
-  +const string unescape_html (string html) + Convert HTML entities to UTF-8. More...
+  const string remove_newlines (string text)  Replace newlines with spaces. More...
  @@ -154,7 +154,7 @@ string _uri

Construct object and set URL.

Initializes TLS and sets proxy from the environment variable http_proxy, if possible.

Since
0.6.0
-
65  :_uri(uri)
66  {
67  Poco::Net::initializeSSL();
68 
69  try
70  {
71  HTTPClientSession::ProxyConfig proxy;
72  string proxy_env = Environment::get("http_proxy");
73  size_t pos;
74 
75  // Only keep text between // and /.
76  if ((pos = proxy_env.find("//")) != string::npos)
77  {
78  proxy_env = proxy_env.substr(pos + 2);
79  }
80  if ((pos = proxy_env.find('/')) != string::npos)
81  {
82  proxy_env = proxy_env.substr(0, pos);
83  }
84 
85  if ((pos = proxy_env.find(':')) != string::npos)
86  {
87  proxy.host = proxy_env.substr(0, pos);
88  proxy.port = std::stoi(proxy_env.substr(pos + 1));
89  }
90  else
91  {
92  proxy.host = proxy_env;
93  }
94 
95  HTTPClientSession::setGlobalProxyConfig(proxy);
96  }
97  catch (const std::exception &)
98  {
99  // No proxy found, no problem.
100  }
101  }
+
63  :_uri(uri)
64  {
65  Poco::Net::initializeSSL();
66 
67  try
68  {
69  HTTPClientSession::ProxyConfig proxy;
70  string proxy_env = Environment::get("http_proxy");
71  size_t pos;
72 
73  // Only keep text between // and /.
74  if ((pos = proxy_env.find("//")) != string::npos)
75  {
76  proxy_env = proxy_env.substr(pos + 2);
77  }
78  if ((pos = proxy_env.find('/')) != string::npos)
79  {
80  proxy_env = proxy_env.substr(0, pos);
81  }
82 
83  if ((pos = proxy_env.find(':')) != string::npos)
84  {
85  proxy.host = proxy_env.substr(0, pos);
86  proxy.port = std::stoi(proxy_env.substr(pos + 1));
87  }
88  else
89  {
90  proxy.host = proxy_env;
91  }
92 
93  HTTPClientSession::setGlobalProxyConfig(proxy);
94  }
95  catch (const std::exception &)
96  {
97  // No proxy found, no problem.
98  }
99  }

Member Function Documentation

@@ -175,7 +175,7 @@ string _uri

Save URI in archive and return archive-URI.

Since
0.6.0
-
592  {
593  if (_uri.substr(0, 4) != "http")
594  {
595  return { false, "Only HTTP(S) is archivable.", "" };
596  }
597 
598  try
599  {
600  const string answer = make_request("https://web.archive.org/save/"
601  + _uri, true);
602 
603  if (!answer.empty())
604  {
605  return { true, "", "https://web.archive.org" + answer };
606  }
607  }
608  catch (const Poco::Exception &e)
609  {
610  return { false, e.displayText(), "" };
611  }
612 
613  return { false, "Unknown error.", "" };
614  }
const string make_request(const string &uri, bool archive=false) const
Make a HTTP(S) request.
Definition: uri.cpp:133
+
599  {
600  if (_uri.substr(0, 4) != "http")
601  {
602  return { false, "Only HTTP(S) is archivable.", "" };
603  }
604 
605  try
606  {
607  const string answer = make_request("https://web.archive.org/save/"
608  + _uri, true);
609 
610  if (!answer.empty())
611  {
612  return { true, "", "https://web.archive.org" + answer };
613  }
614  }
615  catch (const Poco::Exception &e)
616  {
617  return { false, e.displayText(), "" };
618  }
619 
620  return { false, "Unknown error.", "" };
621  }
const string make_request(const string &uri, bool archive=false) const
Make a HTTP(S) request.
Definition: uri.cpp:131
@@ -205,8 +205,8 @@ string _uri

Extract the description from an HTML page.

Since
0.6.0
-
220  {
221  const regex re_htmlfile("\\.(.?html?|xml|rss)$");
222  if (_uri.substr(0, 4) == "http" || regex_search(_uri, re_htmlfile))
223  {
224  smatch match;
225  const regex re("description\"[^>]+content=\"([^\"]+)", icase);
226  regex_search(html, match, re);
227  return remove_newlines(strip_html(match[1].str()));
228  }
229 
230  return "";
231  }
const string strip_html(const string &html)
Removes HTML tags and superflous spaces from an HTML page.
Definition: uri.cpp:233
-
const string remove_newlines(string text)
Replace newlines with spaces.
Definition: uri.cpp:616
+
222  {
223  const RegEx re_htmlfile(".*\\.(.?html?|xml|rss)$", RegEx::RE_CASELESS);
224  if (_uri.substr(0, 4) == "http" || re_htmlfile.match(_uri))
225  {
226  const RegEx re_desc("description\"[^>]+content=\"([^\"]+)",
227  RegEx::RE_CASELESS);
228  vector<string> matches;
229  re_desc.split(html, matches);
230  if (matches.size() >= 2)
231  {
232  return remove_newlines(unescape_html(matches[1]));
233  }
234  }
235 
236  return "";
237  }
const string unescape_html(string html)
Convert HTML entities to UTF-8.
Definition: uri.cpp:298
+
const string remove_newlines(string text)
Replace newlines with spaces.
Definition: uri.cpp:623
@@ -236,8 +236,8 @@ string _uri

Extract the title from an HTML page.

Since
0.6.0
-
207  {
208  const regex re_htmlfile("\\.(.?html?|xml|rss)$");
209  if (_uri.substr(0, 4) == "http" || regex_search(_uri, re_htmlfile))
210  {
211  smatch match;
212  regex_search(html, match, regex("<title>([^<]+)", icase));
213  return remove_newlines(unescape_html(match[1].str()));
214  }
215 
216  return "";
217  }
const string remove_newlines(string text)
Replace newlines with spaces.
Definition: uri.cpp:616
-
const string unescape_html(const string &html)
Convert HTML entities to UTF-8.
Definition: uri.cpp:291
+
205  {
206  const RegEx re_htmlfile(".*\\.(.?html?|xml|rss)$", RegEx::RE_CASELESS);
207  if (_uri.substr(0, 4) == "http" || re_htmlfile.match(_uri))
208  {
209  const RegEx re_title("<title>([^<]+)", RegEx::RE_CASELESS);
210  vector<string> matches;
211  re_title.split(html, matches);
212  if (matches.size() >= 2)
213  {
214  return remove_newlines(unescape_html(matches[1]));
215  }
216  }
217 
218  return "";
219  }
const string unescape_html(string html)
Convert HTML entities to UTF-8.
Definition: uri.cpp:298
+
const string remove_newlines(string text)
Replace newlines with spaces.
Definition: uri.cpp:623
@@ -258,10 +258,10 @@ string _uri

Download URI and extract title, description and full text.

Since
0.6.0
-
109  {
110  try
111  {
112  const string answer = make_request(_uri);
113  if (!answer.empty())
114  {
115  return
116  {
117  true,
118  "",
119  extract_title(answer),
120  extract_description(answer),
121  strip_html(answer)
122  };
123  }
124  }
125  catch (const Poco::Exception &e)
126  {
127  return { false, e.displayText(), "", "", "" };
128  }
129 
130  return { false, "Unknown error.", "", "", "" };
131  }
const string strip_html(const string &html)
Removes HTML tags and superflous spaces from an HTML page.
Definition: uri.cpp:233
-
const string extract_title(const string &html)
Extract the title from an HTML page.
Definition: uri.cpp:206
-
const string make_request(const string &uri, bool archive=false) const
Make a HTTP(S) request.
Definition: uri.cpp:133
-
const string extract_description(const string &html)
Extract the description from an HTML page.
Definition: uri.cpp:219
+
107  {
108  try
109  {
110  const string answer = make_request(_uri);
111  if (!answer.empty())
112  {
113  return
114  {
115  true,
116  "",
117  extract_title(answer),
118  extract_description(answer),
119  strip_html(answer)
120  };
121  }
122  }
123  catch (const Poco::Exception &e)
124  {
125  return { false, e.displayText(), "", "", "" };
126  }
127 
128  return { false, "Unknown error.", "", "", "" };
129  }
const string strip_html(const string &html)
Removes HTML tags and superflous spaces from an HTML page.
Definition: uri.cpp:239
+
const string extract_title(const string &html)
Extract the title from an HTML page.
Definition: uri.cpp:204
+
const string make_request(const string &uri, bool archive=false) const
Make a HTTP(S) request.
Definition: uri.cpp:131
+
const string extract_description(const string &html)
Extract the description from an HTML page.
Definition: uri.cpp:221
@@ -301,8 +301,8 @@ string _uri

Make a HTTP(S) request.

Since
0.6.0
-
134  {
135  Poco::URI poco_uri(uri);
136  string method =
137  archive ? HTTPRequest::HTTP_HEAD : HTTPRequest::HTTP_GET;
138  string path = poco_uri.getPathAndQuery();
139  if (path.empty())
140  {
141  path = "/";
142  }
143 
144  unique_ptr<HTTPClientSession> session;
145  if (poco_uri.getScheme() == "https")
146  {
147  session = make_unique<HTTPSClientSession>(poco_uri.getHost(),
148  poco_uri.getPort());
149  }
150  else if (poco_uri.getScheme() == "http")
151  {
152  session = make_unique<HTTPClientSession>(poco_uri.getHost(),
153  poco_uri.getPort());
154  }
155  else
156  {
157  throw Poco::Exception("Protocol not supported.");
158  }
159 
160  HTTPRequest request(method, path, HTTPMessage::HTTP_1_1);
161  request.set("User-Agent", string("remwharead/") + global::version);
162 
163  HTTPResponse response;
164 
165  session->sendRequest(request);
166  istream &rs = session->receiveResponse(response);
167 
168  // Not using the constants because some are too new for Debian stretch.
169  switch (response.getStatus())
170  {
171  case 301: // HTTPResponse::HTTP_MOVED_PERMANENTLY
172  case 308: // HTTPResponse::HTTP_PERMANENT_REDIRECT
173  case 302: // HTTPResponse::HTTP_FOUND
174  case 303: // HTTPResponse::HTTP_SEE_OTHER
175  case 307: // HTTPResponse::HTTP_TEMPORARY_REDIRECT
176  {
177  string location = response.get("Location");
178  if (location.substr(0, 4) != "http")
179  {
180  location = poco_uri.getScheme() + "://" + poco_uri.getHost()
181  + location;
182  }
183  return make_request(location);
184  }
185  case HTTPResponse::HTTP_OK:
186  {
187  string answer;
188  if (archive)
189  {
190  answer = response.get("Content-Location");
191  }
192  else
193  {
194  StreamCopier::copyToString(rs, answer);
195  }
196  return answer;
197  }
198  default:
199  {
200  throw Poco::Exception(response.getReason());
201  return "";
202  }
203  }
204  }
const string make_request(const string &uri, bool archive=false) const
Make a HTTP(S) request.
Definition: uri.cpp:133
-
const archive_answer archive()
Save URI in archive and return archive-URI.
Definition: uri.cpp:591
+
132  {
133  Poco::URI poco_uri(uri);
134  string method =
135  archive ? HTTPRequest::HTTP_HEAD : HTTPRequest::HTTP_GET;
136  string path = poco_uri.getPathAndQuery();
137  if (path.empty())
138  {
139  path = "/";
140  }
141 
142  unique_ptr<HTTPClientSession> session;
143  if (poco_uri.getScheme() == "https")
144  {
145  session = make_unique<HTTPSClientSession>(poco_uri.getHost(),
146  poco_uri.getPort());
147  }
148  else if (poco_uri.getScheme() == "http")
149  {
150  session = make_unique<HTTPClientSession>(poco_uri.getHost(),
151  poco_uri.getPort());
152  }
153  else
154  {
155  throw Poco::Exception("Protocol not supported.");
156  }
157 
158  HTTPRequest request(method, path, HTTPMessage::HTTP_1_1);
159  request.set("User-Agent", string("remwharead/") + global::version);
160 
161  HTTPResponse response;
162 
163  session->sendRequest(request);
164  istream &rs = session->receiveResponse(response);
165 
166  // Not using the constants because some are too new for Debian stretch.
167  switch (response.getStatus())
168  {
169  case 301: // HTTPResponse::HTTP_MOVED_PERMANENTLY
170  case 308: // HTTPResponse::HTTP_PERMANENT_REDIRECT
171  case 302: // HTTPResponse::HTTP_FOUND
172  case 303: // HTTPResponse::HTTP_SEE_OTHER
173  case 307: // HTTPResponse::HTTP_TEMPORARY_REDIRECT
174  {
175  string location = response.get("Location");
176  if (location.substr(0, 4) != "http")
177  {
178  location = poco_uri.getScheme() + "://" + poco_uri.getHost()
179  + location;
180  }
181  return make_request(location);
182  }
183  case HTTPResponse::HTTP_OK:
184  {
185  string answer;
186  if (archive)
187  {
188  answer = response.get("Content-Location");
189  }
190  else
191  {
192  StreamCopier::copyToString(rs, answer);
193  }
194  return answer;
195  }
196  default:
197  {
198  throw Poco::Exception(response.getReason());
199  return "";
200  }
201  }
202  }
const string make_request(const string &uri, bool archive=false) const
Make a HTTP(S) request.
Definition: uri.cpp:131
+
const archive_answer archive()
Save URI in archive and return archive-URI.
Definition: uri.cpp:598
@@ -349,7 +349,7 @@ string _uri
Since
0.6.0
-
254  {
255  // NOTE: I did this with regex_replace before, but libstdc++ segfaulted.
256  string out;
257  if (tag.empty())
258  {
259  size_t pos = 0;
260  while (pos != std::string::npos)
261  {
262  size_t startpos = html.find('<', pos);
263  size_t endpos = html.find('>', startpos);
264  out += html.substr(pos, startpos - pos);
265  pos = endpos;
266  if (pos != std::string::npos)
267  {
268  ++pos;
269  }
270  }
271  }
272  else
273  {
274  size_t pos = 0;
275  out = html;
276  while ((pos = out.find("<" + tag)) != std::string::npos)
277  {
278  size_t endpos = out.find("</" + tag, pos);
279  if (endpos == std::string::npos)
280  {
281  break;
282  }
283  endpos += 3 + tag.length(); // tag + </ + >
284  out.replace(pos, endpos - pos, "");
285  }
286  }
287 
288  return out;
289  }
+
261  {
262  // NOTE: I did this with regex_replace before, but libstdc++ segfaulted.
263  string out;
264  if (tag.empty())
265  {
266  size_t pos = 0;
267  while (pos != std::string::npos)
268  {
269  size_t startpos = html.find('<', pos);
270  size_t endpos = html.find('>', startpos);
271  out += html.substr(pos, startpos - pos);
272  pos = endpos;
273  if (pos != std::string::npos)
274  {
275  ++pos;
276  }
277  }
278  }
279  else
280  {
281  size_t pos = 0;
282  out = html;
283  while ((pos = out.find("<" + tag)) != std::string::npos)
284  {
285  size_t endpos = out.find("</" + tag, pos);
286  if (endpos == std::string::npos)
287  {
288  break;
289  }
290  endpos += 3 + tag.length(); // tag + </ + >
291  out.replace(pos, endpos - pos, "");
292  }
293  }
294 
295  return out;
296  }
@@ -378,7 +378,7 @@ string _uri

Replace newlines with spaces.

Since
0.6.0
-
617  {
618  size_t posn = 0;
619  while ((posn = text.find('\n', posn)) != std::string::npos)
620  {
621  text.replace(posn, 1, " ");
622 
623  size_t posr = posn - 1;
624  if (text[posr] == '\r')
625  {
626  text.replace(posr, 1, " ");
627  }
628  ++posn;
629  }
630 
631  return text;
632  }
+
624  {
625  size_t posn = 0;
626  while ((posn = text.find('\n', posn)) != std::string::npos)
627  {
628  text.replace(posn, 1, " ");
629 
630  size_t posr = posn - 1;
631  if (text[posr] == '\r')
632  {
633  text.replace(posr, 1, " ");
634  }
635  ++posn;
636  }
637 
638  return text;
639  }
@@ -407,13 +407,13 @@ string _uri

Removes HTML tags and superflous spaces from an HTML page.

Since
0.6.0
-
234  {
235  string out;
236 
237  out = remove_html_tags(html, "script"); // Remove JavaScript.
238  out = remove_html_tags(out, "style"); // Remove CSS.
239  out = remove_html_tags(out); // Remove tags.
240 
241  size_t pos = 0;
242  while ((pos = out.find("\r", pos)) != std::string::npos) // Remove CR.
243  {
244  out.replace(pos, 1, "");
245  }
246 
247  out = regex_replace(out, regex("\\s+\n"), "\n"); // Remove space at eol.
248  out = regex_replace(out, regex("\n{2,}"), "\n"); // Reduce newlines.
249 
250  return unescape_html(out);
251  }
const string remove_html_tags(const string &html, const string &tag="")
Remove HTML tags.
Definition: uri.cpp:253
-
const string unescape_html(const string &html)
Convert HTML entities to UTF-8.
Definition: uri.cpp:291
+
240  {
241  string out;
242 
243  out = remove_html_tags(html, "script"); // Remove JavaScript.
244  out = remove_html_tags(out, "style"); // Remove CSS.
245  out = remove_html_tags(out); // Remove tags.
246 
247  size_t pos = 0;
248  while ((pos = out.find("\r", pos)) != std::string::npos) // Remove CR.
249  {
250  out.replace(pos, 1, "");
251  }
252 
253  // Remove whitespace at eol.
254  RegEx("\\s+\n").subst(out, "\n", RegEx::RE_GLOBAL);
255  RegEx("\n{2,}").subst(out, "\n", RegEx::RE_GLOBAL); // Reduce newlines.
256 
257  return unescape_html(out);
258  }
const string unescape_html(string html)
Convert HTML entities to UTF-8.
Definition: uri.cpp:298
+
const string remove_html_tags(const string &html, const string &tag="")
Remove HTML tags.
Definition: uri.cpp:260
- -

◆ unescape_html()

+ +

◆ unescape_html()

@@ -424,7 +424,7 @@ string _uri const string remwharead::URI::unescape_html ( - const string &  + string  html) @@ -438,7 +438,7 @@ string _uri

Convert HTML entities to UTF-8.

Since
0.6.0
-
292  {
293  string buffer = html;
294  string output;
295 
296  // Used to convert int to utf-8 char.
297  std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> u8c;
298  regex re_entity("&#(x)?([[:alnum:]]{1,8});");
299  smatch match;
300 
301  while (regex_search(buffer, match, re_entity))
302  {
303  char32_t codepoint = 0;
304  // 'x' in front of the number means it's hexadecimal, else decimal.
305  if (match[1].length() == 1)
306  {
307  codepoint = std::stoi(match[2].str(), nullptr, 16);
308  }
309  else
310  {
311  codepoint = std::stoi(match[2].str(), nullptr, 10);
312  }
313  output += match.prefix().str() + u8c.to_bytes(codepoint);
314  buffer = match.suffix().str();
315  }
316  output += buffer;
317 
318  // Source: https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_
319  // entity_references#Character_entity_references_in_HTML
320  const array<const std::pair<const string, const char32_t>, 258> names =
321  {{
322  { "exclamation", 0x0021 },
323  { "quot", 0x0022 },
324  { "percent", 0x0025 },
325  { "amp", 0x0026 },
326  { "apos", 0x0027 },
327  { "add", 0x002B },
328  { "lt", 0x003C },
329  { "equal", 0x003D },
330  { "gt", 0x003E },
331  { "nbsp", 0x00A0 },
332  { "iexcl", 0x00A1 },
333  { "cent", 0x00A2 },
334  { "pound", 0x00A3 },
335  { "curren", 0x00A4 },
336  { "yen", 0x00A5 },
337  { "brvbar", 0x00A6 },
338  { "sect", 0x00A7 },
339  { "uml", 0x00A8 },
340  { "copy", 0x00A9 },
341  { "ordf", 0x00AA },
342  { "laquo", 0x00AB },
343  { "not", 0x00AC },
344  { "shy", 0x00AD },
345  { "reg", 0x00AE },
346  { "macr", 0x00AF },
347  { "deg", 0x00B0 },
348  { "plusmn", 0x00B1 },
349  { "sup2", 0x00B2 },
350  { "sup3", 0x00B3 },
351  { "acute", 0x00B4 },
352  { "micro", 0x00B5 },
353  { "para", 0x00B6 },
354  { "middot", 0x00B7 },
355  { "cedil", 0x00B8 },
356  { "sup1", 0x00B9 },
357  { "ordm", 0x00BA },
358  { "raquo", 0x00BB },
359  { "frac14", 0x00BC },
360  { "frac12", 0x00BD },
361  { "frac34", 0x00BE },
362  { "iquest", 0x00BF },
363  { "Agrave", 0x00C0 },
364  { "Aacute", 0x00C1 },
365  { "Acirc", 0x00C2 },
366  { "Atilde", 0x00C3 },
367  { "Auml", 0x00C4 },
368  { "Aring", 0x00C5 },
369  { "AElig", 0x00C6 },
370  { "Ccedil", 0x00C7 },
371  { "Egrave", 0x00C8 },
372  { "Eacute", 0x00C9 },
373  { "Ecirc", 0x00CA },
374  { "Euml", 0x00CB },
375  { "Igrave", 0x00CC },
376  { "Iacute", 0x00CD },
377  { "Icirc", 0x00CE },
378  { "Iuml", 0x00CF },
379  { "ETH", 0x00D0 },
380  { "Ntilde", 0x00D1 },
381  { "Ograve", 0x00D2 },
382  { "Oacute", 0x00D3 },
383  { "Ocirc", 0x00D4 },
384  { "Otilde", 0x00D5 },
385  { "Ouml", 0x00D6 },
386  { "times", 0x00D7 },
387  { "Oslash", 0x00D8 },
388  { "Ugrave", 0x00D9 },
389  { "Uacute", 0x00DA },
390  { "Ucirc", 0x00DB },
391  { "Uuml", 0x00DC },
392  { "Yacute", 0x00DD },
393  { "THORN", 0x00DE },
394  { "szlig", 0x00DF },
395  { "agrave", 0x00E0 },
396  { "aacute", 0x00E1 },
397  { "acirc", 0x00E2 },
398  { "atilde", 0x00E3 },
399  { "auml", 0x00E4 },
400  { "aring", 0x00E5 },
401  { "aelig", 0x00E6 },
402  { "ccedil", 0x00E7 },
403  { "egrave", 0x00E8 },
404  { "eacute", 0x00E9 },
405  { "ecirc", 0x00EA },
406  { "euml", 0x00EB },
407  { "igrave", 0x00EC },
408  { "iacute", 0x00ED },
409  { "icirc", 0x00EE },
410  { "iuml", 0x00EF },
411  { "eth", 0x00F0 },
412  { "ntilde", 0x00F1 },
413  { "ograve", 0x00F2 },
414  { "oacute", 0x00F3 },
415  { "ocirc", 0x00F4 },
416  { "otilde", 0x00F5 },
417  { "ouml", 0x00F6 },
418  { "divide", 0x00F7 },
419  { "oslash", 0x00F8 },
420  { "ugrave", 0x00F9 },
421  { "uacute", 0x00FA },
422  { "ucirc", 0x00FB },
423  { "uuml", 0x00FC },
424  { "yacute", 0x00FD },
425  { "thorn", 0x00FE },
426  { "yuml", 0x00FF },
427  { "OElig", 0x0152 },
428  { "oelig", 0x0153 },
429  { "Scaron", 0x0160 },
430  { "scaron", 0x0161 },
431  { "Yuml", 0x0178 },
432  { "fnof", 0x0192 },
433  { "circ", 0x02C6 },
434  { "tilde", 0x02DC },
435  { "Alpha", 0x0391 },
436  { "Beta", 0x0392 },
437  { "Gamma", 0x0393 },
438  { "Delta", 0x0394 },
439  { "Epsilon", 0x0395 },
440  { "Zeta", 0x0396 },
441  { "Eta", 0x0397 },
442  { "Theta", 0x0398 },
443  { "Iota", 0x0399 },
444  { "Kappa", 0x039A },
445  { "Lambda", 0x039B },
446  { "Mu", 0x039C },
447  { "Nu", 0x039D },
448  { "Xi", 0x039E },
449  { "Omicron", 0x039F },
450  { "Pi", 0x03A0 },
451  { "Rho", 0x03A1 },
452  { "Sigma", 0x03A3 },
453  { "Tau", 0x03A4 },
454  { "Upsilon", 0x03A5 },
455  { "Phi", 0x03A6 },
456  { "Chi", 0x03A7 },
457  { "Psi", 0x03A8 },
458  { "Omega", 0x03A9 },
459  { "alpha", 0x03B1 },
460  { "beta", 0x03B2 },
461  { "gamma", 0x03B3 },
462  { "delta", 0x03B4 },
463  { "epsilon", 0x03B5 },
464  { "zeta", 0x03B6 },
465  { "eta", 0x03B7 },
466  { "theta", 0x03B8 },
467  { "iota", 0x03B9 },
468  { "kappa", 0x03BA },
469  { "lambda", 0x03BB },
470  { "mu", 0x03BC },
471  { "nu", 0x03BD },
472  { "xi", 0x03BE },
473  { "omicron", 0x03BF },
474  { "pi", 0x03C0 },
475  { "rho", 0x03C1 },
476  { "sigmaf", 0x03C2 },
477  { "sigma", 0x03C3 },
478  { "tau", 0x03C4 },
479  { "upsilon", 0x03C5 },
480  { "phi", 0x03C6 },
481  { "chi", 0x03C7 },
482  { "psi", 0x03C8 },
483  { "omega", 0x03C9 },
484  { "thetasym", 0x03D1 },
485  { "upsih", 0x03D2 },
486  { "piv", 0x03D6 },
487  { "ensp", 0x2002 },
488  { "emsp", 0x2003 },
489  { "thinsp", 0x2009 },
490  { "zwnj", 0x200C },
491  { "zwj", 0x200D },
492  { "lrm", 0x200E },
493  { "rlm", 0x200F },
494  { "ndash", 0x2013 },
495  { "mdash", 0x2014 },
496  { "horbar", 0x2015 },
497  { "lsquo", 0x2018 },
498  { "rsquo", 0x2019 },
499  { "sbquo", 0x201A },
500  { "ldquo", 0x201C },
501  { "rdquo", 0x201D },
502  { "bdquo", 0x201E },
503  { "dagger", 0x2020 },
504  { "Dagger", 0x2021 },
505  { "bull", 0x2022 },
506  { "hellip", 0x2026 },
507  { "permil", 0x2030 },
508  { "prime", 0x2032 },
509  { "Prime", 0x2033 },
510  { "lsaquo", 0x2039 },
511  { "rsaquo", 0x203A },
512  { "oline", 0x203E },
513  { "frasl", 0x2044 },
514  { "euro", 0x20AC },
515  { "image", 0x2111 },
516  { "weierp", 0x2118 },
517  { "real", 0x211C },
518  { "trade", 0x2122 },
519  { "alefsym", 0x2135 },
520  { "larr", 0x2190 },
521  { "uarr", 0x2191 },
522  { "rarr", 0x2192 },
523  { "darr", 0x2193 },
524  { "harr", 0x2194 },
525  { "crarr", 0x21B5 },
526  { "lArr", 0x21D0 },
527  { "uArr", 0x21D1 },
528  { "rArr", 0x21D2 },
529  { "dArr", 0x21D3 },
530  { "hArr", 0x21D4 },
531  { "forall", 0x2200 },
532  { "part", 0x2202 },
533  { "exist", 0x2203 },
534  { "empty", 0x2205 },
535  { "nabla", 0x2207 },
536  { "isin", 0x2208 },
537  { "notin", 0x2209 },
538  { "ni", 0x220B },
539  { "prod", 0x220F },
540  { "sum", 0x2211 },
541  { "minus", 0x2212 },
542  { "lowast", 0x2217 },
543  { "radic", 0x221A },
544  { "prop", 0x221D },
545  { "infin", 0x221E },
546  { "ang", 0x2220 },
547  { "and", 0x2227 },
548  { "or", 0x2228 },
549  { "cap", 0x2229 },
550  { "cup", 0x222A },
551  { "int", 0x222B },
552  { "there4", 0x2234 },
553  { "sim", 0x223C },
554  { "cong", 0x2245 },
555  { "asymp", 0x2248 },
556  { "ne", 0x2260 },
557  { "equiv", 0x2261 },
558  { "le", 0x2264 },
559  { "ge", 0x2265 },
560  { "sub", 0x2282 },
561  { "sup", 0x2283 },
562  { "nsub", 0x2284 },
563  { "sube", 0x2286 },
564  { "supe", 0x2287 },
565  { "oplus", 0x2295 },
566  { "otimes", 0x2297 },
567  { "perp", 0x22A5 },
568  { "sdot", 0x22C5 },
569  { "lceil", 0x2308 },
570  { "rceil", 0x2309 },
571  { "lfloor", 0x230A },
572  { "rfloor", 0x230B },
573  { "lang", 0x2329 },
574  { "rang", 0x232A },
575  { "loz", 0x25CA },
576  { "spades", 0x2660 },
577  { "clubs", 0x2663 },
578  { "hearts", 0x2665 },
579  { "diams", 0x2666 }
580  }};
581 
582  for (auto &pair : names)
583  {
584  const regex re('&' + pair.first + ';');
585  output = regex_replace(output, re, u8c.to_bytes(pair.second));
586  }
587 
588  return output;
589  }
+
299  {
300  // Used to convert int to utf-8 char.
301  std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> u8c;
302  const RegEx re_entity("&#(x)?([[:alnum:]]{1,8});");
303  RegEx::MatchVec matches;
304  string::size_type pos = 0;
305 
306  while (re_entity.match(html, pos, matches) != 0)
307  {
308  char32_t codepoint = 0;
309  const string number = html.substr(matches[2].offset,
310  matches[2].length);
311  // 'x' in front of the number means it's hexadecimal, else decimal.
312  if (matches[1].length != 0)
313  {
314  codepoint = std::stoi(number, nullptr, 16);
315  }
316  else
317  {
318  codepoint = std::stoi(number, nullptr, 10);
319  }
320  const string unicode = u8c.to_bytes(codepoint);
321  html.replace(matches[0].offset, matches[0].length, unicode);
322  pos = matches[0].offset + unicode.length();
323  }
324 
325  // Source: https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_
326  // entity_references#Character_entity_references_in_HTML
327  const array<const std::pair<const string, const char32_t>, 258> names =
328  {{
329  { "exclamation", 0x0021 },
330  { "quot", 0x0022 },
331  { "percent", 0x0025 },
332  { "amp", 0x0026 },
333  { "apos", 0x0027 },
334  { "add", 0x002B },
335  { "lt", 0x003C },
336  { "equal", 0x003D },
337  { "gt", 0x003E },
338  { "nbsp", 0x00A0 },
339  { "iexcl", 0x00A1 },
340  { "cent", 0x00A2 },
341  { "pound", 0x00A3 },
342  { "curren", 0x00A4 },
343  { "yen", 0x00A5 },
344  { "brvbar", 0x00A6 },
345  { "sect", 0x00A7 },
346  { "uml", 0x00A8 },
347  { "copy", 0x00A9 },
348  { "ordf", 0x00AA },
349  { "laquo", 0x00AB },
350  { "not", 0x00AC },
351  { "shy", 0x00AD },
352  { "reg", 0x00AE },
353  { "macr", 0x00AF },
354  { "deg", 0x00B0 },
355  { "plusmn", 0x00B1 },
356  { "sup2", 0x00B2 },
357  { "sup3", 0x00B3 },
358  { "acute", 0x00B4 },
359  { "micro", 0x00B5 },
360  { "para", 0x00B6 },
361  { "middot", 0x00B7 },
362  { "cedil", 0x00B8 },
363  { "sup1", 0x00B9 },
364  { "ordm", 0x00BA },
365  { "raquo", 0x00BB },
366  { "frac14", 0x00BC },
367  { "frac12", 0x00BD },
368  { "frac34", 0x00BE },
369  { "iquest", 0x00BF },
370  { "Agrave", 0x00C0 },
371  { "Aacute", 0x00C1 },
372  { "Acirc", 0x00C2 },
373  { "Atilde", 0x00C3 },
374  { "Auml", 0x00C4 },
375  { "Aring", 0x00C5 },
376  { "AElig", 0x00C6 },
377  { "Ccedil", 0x00C7 },
378  { "Egrave", 0x00C8 },
379  { "Eacute", 0x00C9 },
380  { "Ecirc", 0x00CA },
381  { "Euml", 0x00CB },
382  { "Igrave", 0x00CC },
383  { "Iacute", 0x00CD },
384  { "Icirc", 0x00CE },
385  { "Iuml", 0x00CF },
386  { "ETH", 0x00D0 },
387  { "Ntilde", 0x00D1 },
388  { "Ograve", 0x00D2 },
389  { "Oacute", 0x00D3 },
390  { "Ocirc", 0x00D4 },
391  { "Otilde", 0x00D5 },
392  { "Ouml", 0x00D6 },
393  { "times", 0x00D7 },
394  { "Oslash", 0x00D8 },
395  { "Ugrave", 0x00D9 },
396  { "Uacute", 0x00DA },
397  { "Ucirc", 0x00DB },
398  { "Uuml", 0x00DC },
399  { "Yacute", 0x00DD },
400  { "THORN", 0x00DE },
401  { "szlig", 0x00DF },
402  { "agrave", 0x00E0 },
403  { "aacute", 0x00E1 },
404  { "acirc", 0x00E2 },
405  { "atilde", 0x00E3 },
406  { "auml", 0x00E4 },
407  { "aring", 0x00E5 },
408  { "aelig", 0x00E6 },
409  { "ccedil", 0x00E7 },
410  { "egrave", 0x00E8 },
411  { "eacute", 0x00E9 },
412  { "ecirc", 0x00EA },
413  { "euml", 0x00EB },
414  { "igrave", 0x00EC },
415  { "iacute", 0x00ED },
416  { "icirc", 0x00EE },
417  { "iuml", 0x00EF },
418  { "eth", 0x00F0 },
419  { "ntilde", 0x00F1 },
420  { "ograve", 0x00F2 },
421  { "oacute", 0x00F3 },
422  { "ocirc", 0x00F4 },
423  { "otilde", 0x00F5 },
424  { "ouml", 0x00F6 },
425  { "divide", 0x00F7 },
426  { "oslash", 0x00F8 },
427  { "ugrave", 0x00F9 },
428  { "uacute", 0x00FA },
429  { "ucirc", 0x00FB },
430  { "uuml", 0x00FC },
431  { "yacute", 0x00FD },
432  { "thorn", 0x00FE },
433  { "yuml", 0x00FF },
434  { "OElig", 0x0152 },
435  { "oelig", 0x0153 },
436  { "Scaron", 0x0160 },
437  { "scaron", 0x0161 },
438  { "Yuml", 0x0178 },
439  { "fnof", 0x0192 },
440  { "circ", 0x02C6 },
441  { "tilde", 0x02DC },
442  { "Alpha", 0x0391 },
443  { "Beta", 0x0392 },
444  { "Gamma", 0x0393 },
445  { "Delta", 0x0394 },
446  { "Epsilon", 0x0395 },
447  { "Zeta", 0x0396 },
448  { "Eta", 0x0397 },
449  { "Theta", 0x0398 },
450  { "Iota", 0x0399 },
451  { "Kappa", 0x039A },
452  { "Lambda", 0x039B },
453  { "Mu", 0x039C },
454  { "Nu", 0x039D },
455  { "Xi", 0x039E },
456  { "Omicron", 0x039F },
457  { "Pi", 0x03A0 },
458  { "Rho", 0x03A1 },
459  { "Sigma", 0x03A3 },
460  { "Tau", 0x03A4 },
461  { "Upsilon", 0x03A5 },
462  { "Phi", 0x03A6 },
463  { "Chi", 0x03A7 },
464  { "Psi", 0x03A8 },
465  { "Omega", 0x03A9 },
466  { "alpha", 0x03B1 },
467  { "beta", 0x03B2 },
468  { "gamma", 0x03B3 },
469  { "delta", 0x03B4 },
470  { "epsilon", 0x03B5 },
471  { "zeta", 0x03B6 },
472  { "eta", 0x03B7 },
473  { "theta", 0x03B8 },
474  { "iota", 0x03B9 },
475  { "kappa", 0x03BA },
476  { "lambda", 0x03BB },
477  { "mu", 0x03BC },
478  { "nu", 0x03BD },
479  { "xi", 0x03BE },
480  { "omicron", 0x03BF },
481  { "pi", 0x03C0 },
482  { "rho", 0x03C1 },
483  { "sigmaf", 0x03C2 },
484  { "sigma", 0x03C3 },
485  { "tau", 0x03C4 },
486  { "upsilon", 0x03C5 },
487  { "phi", 0x03C6 },
488  { "chi", 0x03C7 },
489  { "psi", 0x03C8 },
490  { "omega", 0x03C9 },
491  { "thetasym", 0x03D1 },
492  { "upsih", 0x03D2 },
493  { "piv", 0x03D6 },
494  { "ensp", 0x2002 },
495  { "emsp", 0x2003 },
496  { "thinsp", 0x2009 },
497  { "zwnj", 0x200C },
498  { "zwj", 0x200D },
499  { "lrm", 0x200E },
500  { "rlm", 0x200F },
501  { "ndash", 0x2013 },
502  { "mdash", 0x2014 },
503  { "horbar", 0x2015 },
504  { "lsquo", 0x2018 },
505  { "rsquo", 0x2019 },
506  { "sbquo", 0x201A },
507  { "ldquo", 0x201C },
508  { "rdquo", 0x201D },
509  { "bdquo", 0x201E },
510  { "dagger", 0x2020 },
511  { "Dagger", 0x2021 },
512  { "bull", 0x2022 },
513  { "hellip", 0x2026 },
514  { "permil", 0x2030 },
515  { "prime", 0x2032 },
516  { "Prime", 0x2033 },
517  { "lsaquo", 0x2039 },
518  { "rsaquo", 0x203A },
519  { "oline", 0x203E },
520  { "frasl", 0x2044 },
521  { "euro", 0x20AC },
522  { "image", 0x2111 },
523  { "weierp", 0x2118 },
524  { "real", 0x211C },
525  { "trade", 0x2122 },
526  { "alefsym", 0x2135 },
527  { "larr", 0x2190 },
528  { "uarr", 0x2191 },
529  { "rarr", 0x2192 },
530  { "darr", 0x2193 },
531  { "harr", 0x2194 },
532  { "crarr", 0x21B5 },
533  { "lArr", 0x21D0 },
534  { "uArr", 0x21D1 },
535  { "rArr", 0x21D2 },
536  { "dArr", 0x21D3 },
537  { "hArr", 0x21D4 },
538  { "forall", 0x2200 },
539  { "part", 0x2202 },
540  { "exist", 0x2203 },
541  { "empty", 0x2205 },
542  { "nabla", 0x2207 },
543  { "isin", 0x2208 },
544  { "notin", 0x2209 },
545  { "ni", 0x220B },
546  { "prod", 0x220F },
547  { "sum", 0x2211 },
548  { "minus", 0x2212 },
549  { "lowast", 0x2217 },
550  { "radic", 0x221A },
551  { "prop", 0x221D },
552  { "infin", 0x221E },
553  { "ang", 0x2220 },
554  { "and", 0x2227 },
555  { "or", 0x2228 },
556  { "cap", 0x2229 },
557  { "cup", 0x222A },
558  { "int", 0x222B },
559  { "there4", 0x2234 },
560  { "sim", 0x223C },
561  { "cong", 0x2245 },
562  { "asymp", 0x2248 },
563  { "ne", 0x2260 },
564  { "equiv", 0x2261 },
565  { "le", 0x2264 },
566  { "ge", 0x2265 },
567  { "sub", 0x2282 },
568  { "sup", 0x2283 },
569  { "nsub", 0x2284 },
570  { "sube", 0x2286 },
571  { "supe", 0x2287 },
572  { "oplus", 0x2295 },
573  { "otimes", 0x2297 },
574  { "perp", 0x22A5 },
575  { "sdot", 0x22C5 },
576  { "lceil", 0x2308 },
577  { "rceil", 0x2309 },
578  { "lfloor", 0x230A },
579  { "rfloor", 0x230B },
580  { "lang", 0x2329 },
581  { "rang", 0x232A },
582  { "loz", 0x25CA },
583  { "spades", 0x2660 },
584  { "clubs", 0x2663 },
585  { "hearts", 0x2665 },
586  { "diams", 0x2666 }
587  }};
588 
589  for (auto &pair : names)
590  {
591  const RegEx re('&' + pair.first + ';');
592  re.subst(html, u8c.to_bytes(pair.second), RegEx::RE_GLOBAL);
593  }
594 
595  return html;
596  }

The documentation for this class was generated from the following files: