2020-02-16 22:19:44 +01:00
|
|
|
|
// SPDX-License-Identifier: GPL-2.0
|
|
|
|
|
|
|
|
|
|
#include "fulltext.h"
|
|
|
|
|
#include "dive.h"
|
core: introduce divelog structure
The parser API was very annoying, as a number of tables
to-be-filled were passed in as pointers. The goal of this
commit is to collect all these tables in a single struct.
This should make it (more or less) clear what is actually
written into the divelog files.
Moreover, it should now be rather easy to search for
instances, where the global logfile is accessed (and it
turns out that there are many!).
The divelog struct does not contain the tables as substructs,
but only collects pointers. The idea is that the "divelog.h"
file can be included without all the other files describing
the numerous tables.
To make it easier to use from C++ parts of the code, the
struct implements a constructor and a destructor. Sadly,
we can't use smart pointers, since the pointers are accessed
from C code. Therefore the constructor and destructor are
quite complex.
The whole commit is large, but was mostly an automatic
conversion.
One oddity of note: the divelog structure also contains
the "autogroup" flag, since that is saved in the divelog.
This actually fixes a bug: Before, when importing dives
from a different log, the autogroup flag was overwritten.
This was probably not intended and does not happen anymore.
Signed-off-by: Berthold Stoeger <bstoeger@mail.tuwien.ac.at>
2022-11-08 21:31:08 +01:00
|
|
|
|
#include "divelog.h"
|
2020-02-16 22:19:44 +01:00
|
|
|
|
#include "divesite.h"
|
2020-05-16 20:10:53 +02:00
|
|
|
|
#include "tag.h"
|
2020-02-16 22:19:44 +01:00
|
|
|
|
#include "trip.h"
|
2020-04-01 07:36:31 -07:00
|
|
|
|
#include "qthelper.h"
|
2020-02-16 22:19:44 +01:00
|
|
|
|
#include <QLocale>
|
|
|
|
|
#include <map>
|
|
|
|
|
|
|
|
|
|
// This class caches each dives words, so that we can unregister a dive from the full text search
|
|
|
|
|
struct full_text_cache {
|
|
|
|
|
std::vector<QString> words;
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
// The FullText-search class
|
|
|
|
|
class FullText {
|
|
|
|
|
std::map<QString, std::vector<dive *>> words; // Dives that belong to each word
|
|
|
|
|
public:
|
2020-04-08 09:05:28 +02:00
|
|
|
|
void populate(); // Rebuild from current dive_table
|
2020-02-16 22:19:44 +01:00
|
|
|
|
void registerDive(struct dive *d); // Note: can be called repeatedly
|
|
|
|
|
void unregisterDive(struct dive *d); // Note: can be called repeatedly
|
|
|
|
|
void unregisterAll(); // Unregister all dives in the dive table
|
|
|
|
|
FullTextResult find(const FullTextQuery &q, StringFilterMode mode) const; // Find dives matchin all words.
|
|
|
|
|
private:
|
|
|
|
|
void registerWords(struct dive *d, const std::vector<QString> &w);
|
|
|
|
|
void unregisterWords(struct dive *d, const std::vector<QString> &w);
|
|
|
|
|
std::vector<dive *> findDives(const QString &s, StringFilterMode mode) const; // Find dives matching a given word.
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
// This class doesn't depend on any other objects, we might just initialize it at startup.
|
|
|
|
|
static FullText self;
|
|
|
|
|
|
|
|
|
|
// C-interface functions
|
|
|
|
|
|
|
|
|
|
extern "C" {
|
|
|
|
|
|
|
|
|
|
void fulltext_register(struct dive *d)
|
|
|
|
|
{
|
|
|
|
|
self.registerDive(d);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void fulltext_unregister(struct dive *d)
|
|
|
|
|
{
|
|
|
|
|
self.unregisterDive(d);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void fulltext_unregister_all()
|
|
|
|
|
{
|
|
|
|
|
self.unregisterAll();
|
|
|
|
|
}
|
|
|
|
|
|
2020-04-08 09:05:28 +02:00
|
|
|
|
void fulltext_populate()
|
2020-02-16 22:19:44 +01:00
|
|
|
|
{
|
2020-04-08 09:05:28 +02:00
|
|
|
|
self.populate();
|
2020-02-16 22:19:44 +01:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
} // extern "C"
|
|
|
|
|
|
|
|
|
|
// C++-only interface functions
|
|
|
|
|
// Run a query against the index and return the set of matching dives.
// Forwards to the file-local FullText instance.
FullTextResult fulltext_find_dives(const FullTextQuery &q, StringFilterMode mode)
{
	return self.find(q, mode);
}
|
|
|
|
|
|
|
|
|
|
// Check whether a single dive matches the fulltext criterion
|
|
|
|
|
bool fulltext_dive_matches(const struct dive *d, const FullTextQuery &q, StringFilterMode mode)
|
|
|
|
|
{
|
|
|
|
|
if (!q.doit())
|
|
|
|
|
return true;
|
|
|
|
|
if (!d->full_text)
|
|
|
|
|
return false;
|
|
|
|
|
auto matchFunc =
|
|
|
|
|
mode == StringFilterMode::EXACT ? [](const QString &s1, const QString &s2) { return s1 == s2; } :
|
|
|
|
|
mode == StringFilterMode::STARTSWITH ? [](const QString &s1, const QString &s2) { return s1.startsWith(s2); } :
|
|
|
|
|
/* mode == StringFilterMode::SUBSTRING ? */ [](const QString &s1, const QString &s2) { return s1.contains(s2); };
|
|
|
|
|
const std::vector<QString> &words = d->full_text->words;
|
|
|
|
|
for (const QString &search: q.words) {
|
|
|
|
|
if (std::any_of(words.begin(), words.end(), [&search,matchFunc](const QString &w) { return matchFunc(w, search); }))
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Class implementation
|
|
|
|
|
|
2022-06-18 15:09:45 -04:00
|
|
|
|
// Take a text and tokenize it into words. Normalize the words to the base
|
|
|
|
|
// upper case base character (e.g. 'ℓ' to 'L') and add to a given list,
|
|
|
|
|
// if not already in list.
|
2020-02-16 22:19:44 +01:00
|
|
|
|
// We might think about limiting the lower size of words we store.
|
|
|
|
|
// Note: we convert to QString before tokenization because we rely in
|
|
|
|
|
// Qt's isPunct() function.
|
|
|
|
|
static void tokenize(QString s, std::vector<QString> &res)
|
|
|
|
|
{
|
|
|
|
|
if (s.isEmpty())
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
QLocale loc;
|
|
|
|
|
int size = s.size();
|
|
|
|
|
int pos = 0;
|
|
|
|
|
while (pos < size) {
|
|
|
|
|
// Skip whitespace and punctuation
|
|
|
|
|
while (s[pos].isSpace() || s[pos].isPunct()) {
|
|
|
|
|
if (++pos >= size)
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
int end = pos;
|
|
|
|
|
while (end < size && !s[end].isSpace() && !s[end].isPunct())
|
|
|
|
|
++end;
|
2022-06-18 15:09:45 -04:00
|
|
|
|
QString word = s.mid(pos, end - pos);
|
|
|
|
|
word = word.normalized(QString::NormalizationForm_KD);
|
|
|
|
|
word = loc.toUpper(word);
|
2020-02-16 22:19:44 +01:00
|
|
|
|
pos = end;
|
|
|
|
|
|
|
|
|
|
if (find(res.begin(), res.end(), word) == res.end())
|
|
|
|
|
res.push_back(word);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Get all words of a dive
|
|
|
|
|
static std::vector<QString> getWords(const dive *d)
|
|
|
|
|
{
|
|
|
|
|
std::vector<QString> res;
|
|
|
|
|
tokenize(QString(d->notes), res);
|
2022-02-12 14:03:18 +01:00
|
|
|
|
tokenize(QString(d->diveguide), res);
|
2020-02-16 22:19:44 +01:00
|
|
|
|
tokenize(QString(d->buddy), res);
|
|
|
|
|
tokenize(QString(d->suit), res);
|
2020-05-16 20:10:53 +02:00
|
|
|
|
for (const tag_entry *tag = d->tag_list; tag; tag = tag->next)
|
|
|
|
|
tokenize(QString(tag->tag->name), res);
|
2020-02-16 22:19:44 +01:00
|
|
|
|
for (int i = 0; i < d->cylinders.nr; ++i) {
|
2020-08-20 07:31:04 +02:00
|
|
|
|
const cylinder_t &cyl = *get_cylinder(d, i);
|
2020-02-16 22:19:44 +01:00
|
|
|
|
tokenize(QString(cyl.type.description), res);
|
|
|
|
|
}
|
|
|
|
|
for (int i = 0; i < d->weightsystems.nr; ++i) {
|
|
|
|
|
const weightsystem_t &ws = d->weightsystems.weightsystems[i];
|
|
|
|
|
tokenize(QString(ws.description), res);
|
|
|
|
|
}
|
|
|
|
|
// TODO: We should tokenize all dive-sites and trips first and then
|
|
|
|
|
// take the tokens from a cache.
|
2024-03-15 12:36:56 +13:00
|
|
|
|
if (d->dive_site) {
|
2020-02-16 22:19:44 +01:00
|
|
|
|
tokenize(d->dive_site->name, res);
|
2024-03-15 12:36:56 +13:00
|
|
|
|
const char *country = taxonomy_get_country(&d->dive_site->taxonomy);
|
|
|
|
|
if (country)
|
|
|
|
|
tokenize(country, res);
|
|
|
|
|
}
|
2020-02-16 22:19:44 +01:00
|
|
|
|
// TODO: We should index trips separately!
|
|
|
|
|
if (d->divetrip)
|
|
|
|
|
tokenize(d->divetrip->location, res);
|
|
|
|
|
return res;
|
|
|
|
|
}
|
|
|
|
|
|
2020-04-08 09:05:28 +02:00
|
|
|
|
void FullText::populate()
|
2020-02-16 22:19:44 +01:00
|
|
|
|
{
|
2020-04-01 07:36:31 -07:00
|
|
|
|
// we want this to be two calls as the second text is overwritten below by the lines starting with "\r"
|
|
|
|
|
uiNotification(QObject::tr("Create full text index"));
|
|
|
|
|
uiNotification(QObject::tr("start processing"));
|
2020-02-16 22:19:44 +01:00
|
|
|
|
int i;
|
|
|
|
|
dive *d;
|
2020-04-13 10:42:42 +02:00
|
|
|
|
for_each_dive(i, d)
|
2020-02-16 22:19:44 +01:00
|
|
|
|
registerDive(d);
|
core: introduce divelog structure
The parser API was very annoying, as a number of tables
to-be-filled were passed in as pointers. The goal of this
commit is to collect all these tables in a single struct.
This should make it (more or less) clear what is actually
written into the divelog files.
Moreover, it should now be rather easy to search for
instances, where the global logfile is accessed (and it
turns out that there are many!).
The divelog struct does not contain the tables as substructs,
but only collects pointers. The idea is that the "divelog.h"
file can be included without all the other files describing
the numerous tables.
To make it easier to use from C++ parts of the code, the
struct implements a constructor and a destructor. Sadly,
we can't use smart pointers, since the pointers are accessed
from C code. Therfore the constructor and destructor are
quite complex.
The whole commit is large, but was mostly an automatic
conversion.
One oddity of note: the divelog structure also contains
the "autogroup" flag, since that is saved in the divelog.
This actually fixes a bug: Before, when importing dives
from a different log, the autogroup flag was overwritten.
This was probably not intended and does not happen anymore.
Signed-off-by: Berthold Stoeger <bstoeger@mail.tuwien.ac.at>
2022-11-08 21:31:08 +01:00
|
|
|
|
uiNotification(QObject::tr("%1 dives processed").arg(divelog.dives->nr));
|
2020-02-16 22:19:44 +01:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void FullText::registerDive(struct dive *d)
|
|
|
|
|
{
|
2022-06-18 15:09:45 -04:00
|
|
|
|
if (d->full_text)
|
2020-02-16 22:19:44 +01:00
|
|
|
|
unregisterWords(d, d->full_text->words);
|
2022-06-18 15:09:45 -04:00
|
|
|
|
else
|
2020-02-16 22:19:44 +01:00
|
|
|
|
d->full_text = new full_text_cache;
|
|
|
|
|
d->full_text->words = getWords(d);
|
|
|
|
|
registerWords(d, d->full_text->words);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void FullText::unregisterDive(struct dive *d)
|
|
|
|
|
{
|
|
|
|
|
if (!d->full_text)
|
|
|
|
|
return;
|
|
|
|
|
unregisterWords(d, d->full_text->words);
|
|
|
|
|
delete d->full_text;
|
|
|
|
|
d->full_text = nullptr;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void FullText::unregisterAll()
|
|
|
|
|
{
|
|
|
|
|
int i;
|
|
|
|
|
dive *d;
|
|
|
|
|
for_each_dive(i, d) {
|
|
|
|
|
delete d->full_text;
|
|
|
|
|
d->full_text = nullptr;
|
|
|
|
|
}
|
|
|
|
|
words.clear();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Register words of a dive.
|
|
|
|
|
void FullText::registerWords(struct dive *d, const std::vector<QString> &w)
|
|
|
|
|
{
|
|
|
|
|
for (const QString &word: w) {
|
|
|
|
|
std::vector<dive *> &entry = words[word];
|
|
|
|
|
if (std::find(entry.begin(), entry.end(), d) == entry.end())
|
|
|
|
|
entry.push_back(d);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Unregister words of a dive.
|
|
|
|
|
void FullText::unregisterWords(struct dive *d, const std::vector<QString> &w)
|
|
|
|
|
{
|
|
|
|
|
for (const QString &word: w) {
|
|
|
|
|
auto it = words.find(word);
|
|
|
|
|
if (it == words.end()) {
|
|
|
|
|
qWarning("FullText::unregisterWords: didn't find word '%s' in index!?", qPrintable(word));
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
std::vector<dive *> &entry = it->second;
|
|
|
|
|
entry.erase(std::remove(entry.begin(), entry.end(), d));
|
|
|
|
|
if (entry.empty())
|
|
|
|
|
words.erase(it);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Add dives from second array to first, if not yet there
|
|
|
|
|
void combineDives(std::vector<dive *> &to, const std::vector<dive *> &from)
|
|
|
|
|
{
|
|
|
|
|
for (dive *d: from) {
|
|
|
|
|
if (std::find(to.begin(), to.end(), d) == to.end())
|
|
|
|
|
to.push_back(d);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
std::vector<dive *> FullText::findDives(const QString &s, StringFilterMode mode) const
|
|
|
|
|
{
|
|
|
|
|
switch (mode) {
|
|
|
|
|
case StringFilterMode::EXACT:
|
|
|
|
|
default: {
|
|
|
|
|
// Try to access a single word
|
|
|
|
|
auto it = words.find(s);
|
|
|
|
|
if (it == words.end())
|
|
|
|
|
return {};
|
|
|
|
|
return it->second;
|
|
|
|
|
}
|
|
|
|
|
case StringFilterMode::STARTSWITH: {
|
|
|
|
|
// Find all words that start with a substring. We use the fact
|
|
|
|
|
// that these words must form a contiguous block, since the words are
|
|
|
|
|
// ordered lexicographically.
|
|
|
|
|
auto it = words.lower_bound(s);
|
|
|
|
|
if (it == words.end() || !it->first.startsWith(s))
|
|
|
|
|
return {};
|
|
|
|
|
std::vector<dive *> res = it->second;
|
|
|
|
|
++it;
|
|
|
|
|
while (it != words.end() && it->first.startsWith(s)) {
|
|
|
|
|
combineDives(res, it->second);
|
|
|
|
|
++it;
|
|
|
|
|
}
|
|
|
|
|
return res;
|
|
|
|
|
}
|
|
|
|
|
case StringFilterMode::SUBSTRING: {
|
|
|
|
|
// Find all words that contain a substring. Here, we have to check all words!
|
|
|
|
|
std::vector<dive *> res;
|
|
|
|
|
for (auto it = words.begin(); it != words.end(); ++it) {
|
|
|
|
|
if (it->first.contains(s))
|
|
|
|
|
combineDives(res, it->second);
|
|
|
|
|
}
|
|
|
|
|
return res;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Find the dives that match all words of a query.
// The dives found for the first word are successively reduced to those that
// are also found for each of the remaining words. A query without words
// gives an empty result.
FullTextResult FullText::find(const FullTextQuery &q, StringFilterMode mode) const
{
	if (q.words.empty())
		return FullTextResult();

	auto wordIt = q.words.begin();
	std::vector<dive *> res = findDives(*wordIt, mode);
	for (++wordIt; wordIt != q.words.end(); ++wordIt) {
		const std::vector<dive *> other = findDives(*wordIt, mode);
		// Remove dives from res that are not in other
		auto notInOther = [&other] (dive *d) {
			return std::find(other.begin(), other.end(), d) == other.end();
		};
		res.erase(std::remove_if(res.begin(), res.end(), notInOther), res.end());
	}

	return { std::move(res) };
}
|
|
|
|
|
|
|
|
|
|
// Set the query from a search string: the string is stored verbatim and
// the word list is replaced by the tokens of the string.
FullTextQuery &FullTextQuery::operator=(const QString &s)
{
	originalQuery = s;

	// Start from scratch, since tokenize() only appends to the list
	words.clear();
	tokenize(s, words);

	return *this;
}
|
|
|
|
|
|
|
|
|
|
bool FullTextQuery::doit() const
|
|
|
|
|
{
|
|
|
|
|
return !words.empty();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
bool FullTextResult::dive_matches(const struct dive *d) const
|
|
|
|
|
{
|
|
|
|
|
return std::find(dives.begin(), dives.end(), d) != dives.end();
|
|
|
|
|
}
|