mirror of
https://github.com/subsurface/subsurface.git
synced 2024-11-28 05:00:20 +00:00
f18acf6fb9
Also adds a new test, which tests merging of two tag-lists. Signed-off-by: Berthold Stoeger <bstoeger@mail.tuwien.ac.at>
293 lines
8 KiB
C++
293 lines
8 KiB
C++
// SPDX-License-Identifier: GPL-2.0
|
||
|
||
#include "fulltext.h"
|
||
#include "dive.h"
|
||
#include "divelog.h"
|
||
#include "divesite.h"
|
||
#include "tag.h"
|
||
#include "trip.h"
|
||
#include "qthelper.h"
|
||
#include <QLocale>
|
||
#include <map>
|
||
|
||
// This class caches each dives words, so that we can unregister a dive from the full text search
|
||
struct full_text_cache {
|
||
std::vector<QString> words;
|
||
};
|
||
|
||
// The FullText-search class
|
||
class FullText {
|
||
std::map<QString, std::vector<dive *>> words; // Dives that belong to each word
|
||
public:
|
||
void populate(); // Rebuild from current dive_table
|
||
void registerDive(struct dive *d); // Note: can be called repeatedly
|
||
void unregisterDive(struct dive *d); // Note: can be called repeatedly
|
||
void unregisterAll(); // Unregister all dives in the dive table
|
||
FullTextResult find(const FullTextQuery &q, StringFilterMode mode) const; // Find dives matchin all words.
|
||
private:
|
||
void registerWords(struct dive *d, const std::vector<QString> &w);
|
||
void unregisterWords(struct dive *d, const std::vector<QString> &w);
|
||
std::vector<dive *> findDives(const QString &s, StringFilterMode mode) const; // Find dives matching a given word.
|
||
};
|
||
|
||
// This class doesn't depend on any other objects, we might just initialize it at startup.
|
||
// File-local index instance; fulltext_register(), fulltext_unregister(),
// fulltext_unregister_all(), fulltext_populate() and fulltext_find_dives()
// below forward to it.
static FullText self;
|
||
|
||
// C-interface functions
|
||
|
||
void fulltext_register(struct dive *d)
|
||
{
|
||
self.registerDive(d);
|
||
}
|
||
|
||
void fulltext_unregister(struct dive *d)
|
||
{
|
||
self.unregisterDive(d);
|
||
}
|
||
|
||
void fulltext_unregister_all()
|
||
{
|
||
self.unregisterAll();
|
||
}
|
||
|
||
void fulltext_populate()
|
||
{
|
||
self.populate();
|
||
}
|
||
|
||
// C++-only interface functions
|
||
// Look up the dives matching the query in the file-local full text index.
// Forwards to FullText::find().
FullTextResult fulltext_find_dives(const FullTextQuery &q, StringFilterMode mode)
{
	return self.find(q, mode);
}
|
||
|
||
// Check whether a single dive matches the fulltext criterion
|
||
bool fulltext_dive_matches(const struct dive *d, const FullTextQuery &q, StringFilterMode mode)
|
||
{
|
||
if (!q.doit())
|
||
return true;
|
||
if (!d->full_text)
|
||
return false;
|
||
auto matchFunc =
|
||
mode == StringFilterMode::EXACT ? [](const QString &s1, const QString &s2) { return s1 == s2; } :
|
||
mode == StringFilterMode::STARTSWITH ? [](const QString &s1, const QString &s2) { return s1.startsWith(s2); } :
|
||
/* mode == StringFilterMode::SUBSTRING ? */ [](const QString &s1, const QString &s2) { return s1.contains(s2); };
|
||
const std::vector<QString> &words = d->full_text->words;
|
||
for (const QString &search: q.words) {
|
||
if (std::any_of(words.begin(), words.end(), [&search,matchFunc](const QString &w) { return matchFunc(w, search); }))
|
||
return true;
|
||
}
|
||
return false;
|
||
}
|
||
|
||
// Class implementation
|
||
|
||
// Take a text and tokenize it into words. Normalize the words to the base
|
||
// upper case base character (e.g. 'ℓ' to 'L') and add to a given list,
|
||
// if not already in list.
|
||
// We might think about limiting the lower size of words we store.
|
||
// Note: we convert to QString before tokenization because we rely in
|
||
// Qt's isPunct() function.
|
||
static void tokenize(QString s, std::vector<QString> &res)
|
||
{
|
||
if (s.isEmpty())
|
||
return;
|
||
|
||
QLocale loc;
|
||
int size = s.size();
|
||
int pos = 0;
|
||
while (pos < size) {
|
||
// Skip whitespace and punctuation
|
||
while (s[pos].isSpace() || s[pos].isPunct()) {
|
||
if (++pos >= size)
|
||
return;
|
||
}
|
||
int end = pos;
|
||
while (end < size && !s[end].isSpace() && !s[end].isPunct())
|
||
++end;
|
||
QString word = s.mid(pos, end - pos);
|
||
word = word.normalized(QString::NormalizationForm_KD);
|
||
word = loc.toUpper(word);
|
||
pos = end;
|
||
|
||
if (find(res.begin(), res.end(), word) == res.end())
|
||
res.push_back(word);
|
||
}
|
||
}
|
||
|
||
// Get all words of a dive
|
||
static std::vector<QString> getWords(const dive *d)
|
||
{
|
||
std::vector<QString> res;
|
||
tokenize(QString(d->notes), res);
|
||
tokenize(QString(d->diveguide), res);
|
||
tokenize(QString(d->buddy), res);
|
||
tokenize(QString(d->suit), res);
|
||
for (const divetag *tag: d->tags)
|
||
tokenize(QString::fromStdString(tag->name), res);
|
||
for (auto &cyl: d->cylinders)
|
||
tokenize(QString::fromStdString(cyl.type.description), res);
|
||
for (auto &ws: d->weightsystems)
|
||
tokenize(QString::fromStdString(ws.description), res);
|
||
// TODO: We should tokenize all dive-sites and trips first and then
|
||
// take the tokens from a cache.
|
||
if (d->dive_site) {
|
||
tokenize(QString::fromStdString(d->dive_site->name), res);
|
||
std::string country = taxonomy_get_country(d->dive_site->taxonomy);
|
||
if (!country.empty())
|
||
tokenize(country.c_str(), res);
|
||
}
|
||
// TODO: We should index trips separately!
|
||
if (d->divetrip)
|
||
tokenize(d->divetrip->location, res);
|
||
return res;
|
||
}
|
||
|
||
void FullText::populate()
|
||
{
|
||
// we want this to be two calls as the second text is overwritten below by the lines starting with "\r"
|
||
uiNotification(QObject::tr("Create full text index"));
|
||
uiNotification(QObject::tr("start processing"));
|
||
int i;
|
||
dive *d;
|
||
for_each_dive(i, d)
|
||
registerDive(d);
|
||
uiNotification(QObject::tr("%1 dives processed").arg(divelog.dives->nr));
|
||
}
|
||
|
||
void FullText::registerDive(struct dive *d)
|
||
{
|
||
if (d->full_text)
|
||
unregisterWords(d, d->full_text->words);
|
||
else
|
||
d->full_text = new full_text_cache;
|
||
d->full_text->words = getWords(d);
|
||
registerWords(d, d->full_text->words);
|
||
}
|
||
|
||
void FullText::unregisterDive(struct dive *d)
|
||
{
|
||
if (!d->full_text)
|
||
return;
|
||
unregisterWords(d, d->full_text->words);
|
||
delete d->full_text;
|
||
d->full_text = nullptr;
|
||
}
|
||
|
||
void FullText::unregisterAll()
|
||
{
|
||
int i;
|
||
dive *d;
|
||
for_each_dive(i, d) {
|
||
delete d->full_text;
|
||
d->full_text = nullptr;
|
||
}
|
||
words.clear();
|
||
}
|
||
|
||
// Register words of a dive.
|
||
void FullText::registerWords(struct dive *d, const std::vector<QString> &w)
|
||
{
|
||
for (const QString &word: w) {
|
||
std::vector<dive *> &entry = words[word];
|
||
if (std::find(entry.begin(), entry.end(), d) == entry.end())
|
||
entry.push_back(d);
|
||
}
|
||
}
|
||
|
||
// Unregister words of a dive.
|
||
void FullText::unregisterWords(struct dive *d, const std::vector<QString> &w)
|
||
{
|
||
for (const QString &word: w) {
|
||
auto it = words.find(word);
|
||
if (it == words.end()) {
|
||
qWarning("FullText::unregisterWords: didn't find word '%s' in index!?", qPrintable(word));
|
||
continue;
|
||
}
|
||
std::vector<dive *> &entry = it->second;
|
||
entry.erase(std::remove(entry.begin(), entry.end(), d));
|
||
if (entry.empty())
|
||
words.erase(it);
|
||
}
|
||
}
|
||
|
||
// Add dives from second array to first, if not yet there
|
||
void combineDives(std::vector<dive *> &to, const std::vector<dive *> &from)
|
||
{
|
||
for (dive *d: from) {
|
||
if (std::find(to.begin(), to.end(), d) == to.end())
|
||
to.push_back(d);
|
||
}
|
||
}
|
||
|
||
std::vector<dive *> FullText::findDives(const QString &s, StringFilterMode mode) const
|
||
{
|
||
switch (mode) {
|
||
case StringFilterMode::EXACT:
|
||
default: {
|
||
// Try to access a single word
|
||
auto it = words.find(s);
|
||
if (it == words.end())
|
||
return {};
|
||
return it->second;
|
||
}
|
||
case StringFilterMode::STARTSWITH: {
|
||
// Find all words that start with a substring. We use the fact
|
||
// that these words must form a contiguous block, since the words are
|
||
// ordered lexicographically.
|
||
auto it = words.lower_bound(s);
|
||
if (it == words.end() || !it->first.startsWith(s))
|
||
return {};
|
||
std::vector<dive *> res = it->second;
|
||
++it;
|
||
while (it != words.end() && it->first.startsWith(s)) {
|
||
combineDives(res, it->second);
|
||
++it;
|
||
}
|
||
return res;
|
||
}
|
||
case StringFilterMode::SUBSTRING: {
|
||
// Find all words that contain a substring. Here, we have to check all words!
|
||
std::vector<dive *> res;
|
||
for (auto it = words.begin(); it != words.end(); ++it) {
|
||
if (it->first.contains(s))
|
||
combineDives(res, it->second);
|
||
}
|
||
return res;
|
||
}
|
||
}
|
||
}
|
||
|
||
// Find the dives that match all words of the query.
// Starts with the dives matching the first word and then removes every dive
// that does not match each further word. An empty query yields a
// default-constructed result.
FullTextResult FullText::find(const FullTextQuery &q, StringFilterMode mode) const
{
	if (q.words.empty())
		return FullTextResult();

	auto word = q.words.begin();
	std::vector<dive *> candidates = findDives(*word, mode);
	for (++word; word != q.words.end(); ++word) {
		const std::vector<dive *> hits = findDives(*word, mode);
		// Keep only the candidates that also match this word.
		const auto notInHits = [&hits] (dive *d) {
			return std::find(hits.begin(), hits.end(), d) == hits.end();
		};
		candidates.erase(std::remove_if(candidates.begin(), candidates.end(), notInHits),
				 candidates.end());
	}

	return { std::move(candidates) };
}
|
||
|
||
// Assign a new query string: replace the word list with the tokens of s
// and remember the original text. Returns *this.
FullTextQuery &FullTextQuery::operator=(const QString &s)
{
	// Re-tokenize from scratch so that no words of a previous query survive.
	words.clear();
	tokenize(s, words);
	originalQuery = s;
	return *this;
}
|
||
|
||
bool FullTextQuery::doit() const
|
||
{
|
||
return !words.empty();
|
||
}
|
||
|
||
bool FullTextResult::dive_matches(const struct dive *d) const
|
||
{
|
||
return std::find(dives.begin(), dives.end(), d) != dives.end();
|
||
}
|