Compare commits

..

6 commits

Author SHA1 Message Date
Imbus
7a62bebf76 Initial implementation of matcher 2024-11-21 08:50:09 +01:00
Imbus
70170ea995 Formatting 2024-11-21 08:49:45 +01:00
Imbus
d10300509e Formatting 2024-11-21 08:47:51 +01:00
Imbus
7dd7f5610b Expose triagrams from word 2024-11-21 08:47:45 +01:00
Imbus
94d807fc67 Type casting fixes and bounding array access 2024-11-21 08:46:55 +01:00
Imbus
8c8930f5c5 Makefile targets for linting 2024-11-21 08:45:34 +01:00
8 changed files with 118 additions and 36 deletions

View file

@ -3,6 +3,7 @@ CXXFLAGS = -Wall -Wextra -Wpedantic -Wshadow -Wnon-virtual-dtor -Wold-style-cast
#CXXFLAGS += -Werror #CXXFLAGS += -Werror
SRC = $(wildcard *.cc) SRC = $(wildcard *.cc)
HDR = $(wildcard *.h)
OBJ = $(SRC:.cc=.o) OBJ = $(SRC:.cc=.o)
all: spell edit $(OBJ) all: spell edit $(OBJ)
@ -19,7 +20,18 @@ spell: spell.o word.o dictionary.o
@echo "Building $@" @echo "Building $@"
@$(CXX) -c $(CXXFLAGS) $< -o $@ @$(CXX) -c $(CXXFLAGS) $< -o $@
# Umbrella target: runs every static-analysis / formatting target below.
lint: clang-tidy cppcheck clang-format
# Runs clang-tidy over all sources; everything after `--` is passed to the
# compiler invocation, so the checks see the same flags as the real build.
clang-tidy:
clang-tidy $(SRC) -- $(CXXFLAGS)
# Runs cppcheck with all checks enabled over sources and headers, treating
# them as C++17 and silencing warnings about unresolved system includes.
cppcheck:
cppcheck --enable=all --language=c++ --std=c++17 --suppress=missingIncludeSystem -I/usr/include $(SRC) $(HDR)
# NOTE: `-i` rewrites the sources and headers in place, so `make lint`
# modifies the working tree rather than only reporting problems.
clang-format:
clang-format -i $(SRC) $(HDR)
clean: clean:
rm -f *.o spell edit rm -f *.o spell edit
.PHONY: clean .PHONY: clean all lint clang-tidy cppcheck clang-format

View file

@ -1,9 +1,9 @@
#include "dictionary.h" #include "dictionary.h"
#include "word.h" #include "word.h"
#include <algorithm> #include <algorithm>
#include <filesystem>
#include <fstream> #include <fstream>
#include <iostream> #include <iostream>
#include <set>
#include <string> #include <string>
#include <vector> #include <vector>
@ -13,7 +13,7 @@ using std::vector;
Dictionary::Dictionary() {} Dictionary::Dictionary() {}
bool Dictionary::contains(const string &word) const { bool Dictionary::contains(const string &word) const {
int l = word.length(); auto l = word.length();
Word w = Word(word); Word w = Word(word);
if (std::find(this->words[l].begin(), this->words[l].end(), w) != if (std::find(this->words[l].begin(), this->words[l].end(), w) !=
std::end(this->words[l])) { std::end(this->words[l])) {
@ -22,14 +22,77 @@ bool Dictionary::contains(const string &word) const {
return false; return false;
} }
vector<string> Dictionary::get_suggestions(const string &word) const { std::vector<string> Dictionary::get_suggestions(const string &word) const {
vector<string> suggestions; vector<string> suggestions;
// add_trigram_suggestions(suggestions, word); add_trigram_suggestions(suggestions, word);
// rank_suggestions(suggestions, word); rank_suggestions(suggestions, word);
// trim_suggestions(suggestions); trim_suggestions(suggestions, word);
return suggestions; return suggestions;
} }
void Dictionary::add_trigram_suggestions(std::vector<std::string> &suggestions,
const std::string &word) const {
// Get trigrams of the input word
Word input_word(word);
const std::vector<std::string> &input_trigrams = input_word.get_triagrams();
// Iterate through all words in the dictionary
for (int i = 0; i < MAXLEN; ++i) {
for (const Word &dict_word : words[i]) {
// Get the trigrams of the dictionary word
const std::vector<std::string> &dict_word_trigrams =
dict_word.get_triagrams();
// Count how many trigrams match
unsigned int match_count = dict_word.get_matches(input_trigrams);
// If there are any matches, add the word to suggestions
if (match_count > 0) {
suggestions.push_back(dict_word.get_word());
}
}
}
}
void Dictionary::rank_suggestions(std::vector<std::string> &suggestions,
const std::string &word) const {
// Get trigrams of the input word
Word input_word(word);
const std::vector<std::string> &input_trigrams = input_word.get_triagrams();
// Sort suggestions based on the number of matching trigrams
std::sort(suggestions.begin(), suggestions.end(),
[&](const std::string &a, const std::string &b) {
Word word_a(a);
Word word_b(b);
unsigned int match_a = word_a.get_matches(input_trigrams);
unsigned int match_b = word_b.get_matches(input_trigrams);
return match_a >
match_b; // Sort in descending order of match count
});
}
/**
 * @brief Filters @p suggestions in place while preserving their order.
 *
 * Removes:
 *  - duplicates (the first occurrence is kept),
 *  - the input word itself,
 *  - candidates shorter than three characters.
 *
 * The relative order of the surviving entries is left untouched, so a
 * ranking applied before this call is still valid afterwards. Copying the
 * list through a std::set and back would re-sort it alphabetically.
 *
 * @param suggestions List to filter in place.
 * @param word        The input word, which is never suggested back.
 */
void Dictionary::trim_suggestions(std::vector<std::string> &suggestions,
                                  const std::string &word) const {
  // Candidates shorter than this are dropped.
  const std::string::size_type min_length = 3;

  // `seen` only answers "have we kept this already?"; the output order comes
  // from iterating `suggestions` front to back.
  std::set<std::string> seen;
  std::vector<std::string> kept;
  kept.reserve(suggestions.size());

  for (const std::string &candidate : suggestions) {
    if (candidate == word || candidate.length() < min_length) {
      continue;
    }
    // insert().second is true only the first time a value is inserted.
    if (seen.insert(candidate).second) {
      kept.push_back(candidate);
    }
  }

  suggestions.swap(kept);
}
int Dictionary::spit(path p) { int Dictionary::spit(path p) {
std::ofstream file(p); std::ofstream file(p);
@ -60,9 +123,9 @@ int Dictionary::slurp(path p) {
std::string line; std::string line;
while (std::getline(file, line)) { while (std::getline(file, line)) {
if (line.size() > MAXLEN) // Words larger than max gets placed in the topmost bucket
continue; words[std::min(line.size(), static_cast<size_t>(MAXLEN) - 1)].push_back(
words[line.size()].push_back(Word(line)); Word(line));
} }
file.close(); file.close();

View file

@ -1,5 +1,4 @@
#ifndef DICTIONARY_H #pragma once
#define DICTIONARY_H
#include "word.h" #include "word.h"
#include <filesystem> #include <filesystem>
@ -8,19 +7,23 @@
#define MAXLEN 30 #define MAXLEN 30
using std::vector; // using std::vector;
using std::filesystem::path; using std::filesystem::path;
class Dictionary { class Dictionary {
public: public:
Dictionary(); Dictionary();
/** Appends to suggestions every dictionary word that has at least one
    trigram in common with word */
void add_trigram_suggestions(std::vector<std::string> &suggestions,
const std::string &word) const;
/** Reorders suggestions so that entries sharing more trigrams with word
    come first */
void rank_suggestions(std::vector<std::string> &suggestions,
const std::string &word) const;
/** Removes duplicates, the input word itself, and entries shorter than
    three characters from suggestions */
void trim_suggestions(std::vector<std::string> &suggestions,
const std::string &word) const;
bool contains(const std::string &word) const; bool contains(const std::string &word) const;
std::vector<std::string> get_suggestions(const std::string &word) const; std::vector<std::string> get_suggestions(const std::string &word) const;
int slurp(path p); int slurp(path p);
int spit(path p); int spit(path p);
private: private:
vector<Word> words[MAXLEN]; std::vector<Word> words[MAXLEN];
}; };
#endif

View file

@ -1,14 +1,13 @@
#include <iostream>
#include <vector>
#include <string>
#include <algorithm> #include <algorithm>
#include <string>
#include <vector>
int edit_distance(const std::string& s1, const std::string& s2) { int edit_distance(const std::string &s1, const std::string &s2) {
size_t m = s1.size(); size_t m = s1.size();
size_t n = s2.size(); size_t n = s2.size();
// Create a 2D DP table // Create a 2D DP table
std::vector<std::vector<int>> dp(m + 1, std::vector<int>(n + 1)); std::vector<std::vector<size_t>> dp(m + 1, std::vector<size_t>(n + 1));
// Fill the base cases // Fill the base cases
for (size_t i = 0; i <= m; ++i) for (size_t i = 0; i <= m; ++i)
@ -23,13 +22,14 @@ int edit_distance(const std::string& s1, const std::string& s2) {
if (s1[i - 1] == s2[j - 1]) { if (s1[i - 1] == s2[j - 1]) {
dp[i][j] = dp[i - 1][j - 1]; // No operation needed dp[i][j] = dp[i - 1][j - 1]; // No operation needed
} else { } else {
dp[i][j] = 1 + std::min({dp[i - 1][j], // Deletion dp[i][j] = 1 + std::min({
dp[i][j - 1], // Insertion dp[i - 1][j], // Deletion
dp[i - 1][j - 1] // Substitution dp[i][j - 1], // Insertion
}); dp[i - 1][j - 1] // Substitution
});
} }
} }
} }
return dp[m][n]; return static_cast<int>(dp[m][n]);
} }

View file

@ -1,18 +1,17 @@
#include <algorithm>
#include <iostream>
#include <string> #include <string>
#include <vector>
/** /**
* @brief Computes the edit distance (Levenshtein distance) between two strings. * @brief Computes the edit distance (Levenshtein distance) between two strings.
* *
* The edit distance is defined as the minimum number of single-character edits * The edit distance is defined as the minimum number of single-character edits
* (insertions, deletions, or substitutions) required to transform one string into the other. * (insertions, deletions, or substitutions) required to transform one string
* into the other.
* *
* This implementation uses dynamic programming to compute the distance efficiently. * This implementation uses dynamic programming to compute the distance
* efficiently.
* *
* @param s1 The first string. * @param s1 The first string.
* @param s2 The second string. * @param s2 The second string.
* @return The edit distance between the two strings. * @return The edit distance between the two strings.
*/ */
int edit_distance(const std::string& s1, const std::string& s2); int edit_distance(const std::string &s1, const std::string &s2);

View file

@ -8,8 +8,7 @@
#include <iostream> #include <iostream>
bool do_test(const std::string& x, const std::string& y, int expected) bool do_test(const std::string &x, const std::string &y, int expected) {
{
auto actual = edit_distance(x, y); auto actual = edit_distance(x, y);
if (actual != expected) { if (actual != expected) {
std::cout << "*** WRONG: distance(" << x << ", " << y << ") was " std::cout << "*** WRONG: distance(" << x << ", " << y << ") was "
@ -19,8 +18,7 @@ bool do_test(const std::string& x, const std::string& y, int expected)
return false; return false;
} }
int main() int main() {
{
int res = do_test("foobar", "foobar", 0); int res = do_test("foobar", "foobar", 0);
res += do_test("x", "x", 0); res += do_test("x", "x", 0);
res += do_test("baz", "bar", 1); res += do_test("baz", "bar", 1);

View file

@ -32,6 +32,10 @@ Word::Word(const std::string &w) : word(w) {
string Word::get_word() const { return string(); } string Word::get_word() const { return string(); }
/**
 * Returns `triagrams` by value, so the caller receives its own copy and
 * later changes to either side do not affect the other.
 * (Presumably `triagrams` is the trigram list built by the constructor —
 * its declaration is not visible here; confirm in word.h.)
 */
vector<std::string> Word::get_triagrams() const {
return triagrams;
}
unsigned int Word::get_matches(const vector<string> &t) const { unsigned int Word::get_matches(const vector<string> &t) const {
unsigned int matches = 0; unsigned int matches = 0;

View file

@ -17,6 +17,9 @@ class Word {
/** Returns the word */ /** Returns the word */
std::string get_word() const; std::string get_word() const;
/** Returns a copy of the trigrams ("triagrams") held by this word */
std::vector<std::string> get_triagrams() const;
/** Returns how many of the trigrams in t that are present /** Returns how many of the trigrams in t that are present
in this word's trigram vector */ in this word's trigram vector */
unsigned int get_matches(const std::vector<std::string> &t) const; unsigned int get_matches(const std::vector<std::string> &t) const;