Skip to content

Commit

Permalink
Merge pull request #48 from BianchTech/46-refactor-move-string-manipu…
Browse files Browse the repository at this point in the history
…lation-code-to-a-utility-folder

refactor(string_operations.h/cpp): add new util operations
  • Loading branch information
pedrobiqua authored Jan 27, 2025
2 parents 32d3202 + 1dda89e commit 438da18
Show file tree
Hide file tree
Showing 7 changed files with 116 additions and 129 deletions.
1 change: 1 addition & 0 deletions lib/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ add_library(
src/inverted_index.cpp
src/preprocessing/stemmer.cpp
src/exceptions/invalid_pointer_exception.cpp
src/utils/string_operations.cpp
)

target_include_directories(search_engine PUBLIC include)
Expand Down
30 changes: 2 additions & 28 deletions lib/include/inverted_index.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
#include <string>
#include <vector>

#include "utils/string_operations.h"

namespace inverted_index {
/**
* @typedef str
Expand All @@ -22,12 +24,6 @@ typedef std::string str;
*/
typedef std::list<str> list_str;

/**
* @define DELIMITER
* @brief Delimiter used to split strings.
*/
#define DELIMITER " "

/**
* @struct docs
* @brief Structure that stores information about a document.
Expand Down Expand Up @@ -107,23 +103,6 @@ typedef std::set<docs> set_docs;
*/
typedef std::vector<str> vector_str;

/**
 * @brief Converts a single-byte character to lowercase.
 *
 * Marked `inline` because the definition lives in a header: without it every
 * translation unit that includes this file emits its own external definition
 * and the program fails to link.
 *
 * @param c Character to convert, passed as `unsigned char` so the value handed
 *          to `std::tolower` is always representable.
 * @return Character converted to lowercase.
 */
inline char to_lowercase(unsigned char c) {
  // std::tolower returns int; make the narrowing back to char explicit.
  return static_cast<char>(std::tolower(c));
}

/**
* @brief Splits a string based on a delimiter.
* @param s String to split.
* @param delimiter Delimiter to split the string.
* @return Vector of strings resulting from the split.
*/
vector_str split(str& s, const str& delimiter);

/**
* @brief Adds a new document to the document map.
* @param mp Map of words to lists of documents.
Expand All @@ -149,11 +128,6 @@ list_docs find_doc(map_str_docs& mp, str& word);
*/
list_docs find_answer(map_str_docs& mp, str& input);

/**
* @brief Removes unwanted characters from a string, such as extra spaces.
* @param input String to process.
*/
void shrink_string(std::string* input);
} // namespace inverted_index

#endif // INVERTED_INDEX
21 changes: 1 addition & 20 deletions lib/include/preprocessing/stemmer.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include <vector>

#include "exceptions/invalid_pointer_exception.h"
#include "utils/string_operations.h"

namespace stemmer {

Expand Down Expand Up @@ -89,26 +90,6 @@ class RSPL {
* @return `true` if the word ends with 'a', otherwise `false`.
*/
bool endsWithA(const std::string& word);

/**
* @brief Splits a string into parts based on delimiters.
* @param s The string to be split.
* @return A vector containing the parts of the string.
*/
std::vector<std::string> split(std::string& s);

/**
* @brief Removes accents from a string.
* @param input The input string.
* @return The string without accents.
*/
std::string removeAccents(const std::string& input);

/**
* @brief Shrinks the size of a string to normalize it.
* @param input Pointer to the input string.
*/
void shrinkString(std::string* input);
};

} // namespace stemmer
Expand Down
56 changes: 56 additions & 0 deletions lib/include/utils/string_operations.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#include <unicode/locid.h>
#include <unicode/unistr.h>
#include <unicode/ustream.h>
#include <codecvt>
#include <locale>
#include <string>
#include <unordered_map>
#include <vector>

// Include guard: this header is pulled in by several other project headers,
// and it defines namespace-scope objects that must not be seen twice in one
// translation unit. The system/ICU headers above carry their own guards.
#ifndef UTILS_STRING_OPERATIONS_H
#define UTILS_STRING_OPERATIONS_H

namespace utils {

/**
 * @brief Delimiter used to split strings (a single space).
 *
 * Declared `inline const` instead of `constexpr`: `std::string` is not a
 * literal type before C++20, so a `constexpr std::string` does not compile
 * under C++17. `inline` gives the program a single shared object.
 */
inline const std::string DELIMITER = " ";

/**
 * @brief Mapping to normalize accented characters to their ASCII
 * equivalents.
 *
 * `inline` so that all translation units share one table instead of each
 * building a private copy at start-up.
 */
inline const std::unordered_map<wchar_t, wchar_t> accentMap_ = {
    {L'á', L'a'}, {L'à', L'a'}, {L'â', L'a'}, {L'ã', L'a'}, {L'ä', L'a'},
    {L'é', L'e'}, {L'è', L'e'}, {L'ê', L'e'}, {L'ë', L'e'}, {L'í', L'i'},
    {L'ì', L'i'}, {L'î', L'i'}, {L'ï', L'i'}, {L'ó', L'o'}, {L'ò', L'o'},
    {L'ô', L'o'}, {L'õ', L'o'}, {L'ö', L'o'}, {L'ú', L'u'}, {L'ù', L'u'},
    {L'û', L'u'}, {L'ü', L'u'}, {L'ç', L'c'}, {L'Á', L'A'}, {L'À', L'A'},
    {L'Â', L'A'}, {L'Ã', L'A'}, {L'Ä', L'A'}, {L'É', L'E'}, {L'È', L'E'},
    {L'Ê', L'E'}, {L'Ë', L'E'}, {L'Í', L'I'}, {L'Ì', L'I'}, {L'Î', L'I'},
    {L'Ï', L'I'}, {L'Ó', L'O'}, {L'Ò', L'O'}, {L'Ô', L'O'}, {L'Õ', L'O'},
    {L'Ö', L'O'}, {L'Ú', L'U'}, {L'Ù', L'U'}, {L'Û', L'U'}, {L'Ü', L'U'},
    {L'Ç', L'C'}};

// String manipulation functions

/**
 * @brief Converts a UTF-8 string to lowercase, in place.
 * @param input String to process; a null pointer is ignored.
 */
void shrink_string(std::string* input);

/**
 * @brief Splits a string based on a delimiter.
 * @param s String to split. It is consumed by the call: on return it holds
 *          only the last token.
 * @param delimiter Delimiter to split the string.
 * @return Vector of strings resulting from the split.
 */
std::vector<std::string> split(std::string& s, const std::string& delimiter);

/**
 * @brief Removes accents from a string.
 * @param input The input string (UTF-8).
 * @return The string with every character listed in accentMap_ replaced by
 *         its unaccented equivalent.
 */
std::string removeAccents(const std::string& input);

}  // namespace utils

#endif  // UTILS_STRING_OPERATIONS_H
33 changes: 4 additions & 29 deletions lib/src/inverted_index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,36 +8,11 @@

using namespace inverted_index;

/**
 * @brief Splits a string on every occurrence of a delimiter.
 *
 * Empty tokens are kept (two adjacent delimiters yield an empty string). As a
 * side effect @p s is consumed: on return it holds only the last token.
 *
 * @param s String to split; modified as described above.
 * @param delimiter Separator to look for. When empty, no splitting is done and
 *                  the whole string is returned as a single token.
 * @return The tokens, in order of appearance.
 */
vector_str inverted_index::split(str& s, const str& delimiter) {
  vector_str tokens;

  // An empty delimiter matches at position 0 without consuming anything, so
  // the scan below would never advance. Treat it as "nothing to split on".
  if (delimiter.empty()) {
    tokens.push_back(s);
    return tokens;
  }

  // Walk the string with an offset instead of erasing the front after every
  // token, which would shift the remaining characters each time.
  size_t start = 0;
  size_t pos = 0;
  while ((pos = s.find(delimiter, start)) != std::string::npos) {
    tokens.push_back(s.substr(start, pos - start));
    start = pos + delimiter.length();
  }

  // Leave only the unconsumed tail in s, then emit it as the last token.
  s.erase(0, start);
  tokens.push_back(s);

  return tokens;
}

/**
 * @brief Converts a UTF-8 string to lowercase, in place.
 * @param input String to process; a null pointer is ignored.
 */
void inverted_index::shrink_string(std::string* input) {
  if (!input)
    return;  // Nothing to do for a null pointer.

  // Decode from the full byte range (pointer + length) rather than c_str():
  // the C-string form stops at the first NUL byte and opens a converter by
  // name on every call.
  icu::UnicodeString ustr =
      icu::UnicodeString::fromUTF8(icu::StringPiece(*input));
  ustr.toLower();

  // toUTF8String appends, so start from an empty string. ustr owns its own
  // copy of the text, so clearing the input here is safe.
  input->clear();
  ustr.toUTF8String(*input);
}

map_str_docs inverted_index::add_doc(map_str_docs& mp,
const str& doc_name,
str& text) {
shrink_string(&text);
auto words = inverted_index::split(text, DELIMITER);
utils::shrink_string(&text);
auto words = utils::split(text, utils::DELIMITER);

for (const auto& word : words) {
docs target = {doc_name, 1};
Expand All @@ -61,8 +36,8 @@ list_docs inverted_index::find_answer(map_str_docs& mp, str& input) {
list_docs result;
set_docs unique_docs;

shrink_string(&input);
auto words = inverted_index::split(input, DELIMITER);
utils::shrink_string(&input);
auto words = utils::split(input, utils::DELIMITER);

for (auto& word : words) {
list_docs docs = inverted_index::find_doc(
Expand Down
55 changes: 3 additions & 52 deletions lib/src/preprocessing/stemmer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -164,55 +164,6 @@ bool RSPL::endsWithS(const std::string& word) {
return false;
}

/**
 * @brief Splits a string on single spaces.
 *
 * Empty tokens are kept. The input is consumed while splitting: on return
 * @p s holds only the last token.
 *
 * @param s The string to be split; modified as described above.
 * @return A vector containing the parts of the string.
 */
std::vector<std::string> RSPL::split(std::string& s) {
  const std::string separator = " ";
  std::vector<std::string> parts;

  // Peel one token off the front of s per iteration.
  for (size_t cut = s.find(separator); cut != std::string::npos;
       cut = s.find(separator)) {
    parts.push_back(s.substr(0, cut));
    s.erase(0, cut + separator.length());
  }

  // Whatever remains after the last separator is the final token.
  parts.push_back(s);
  return parts;
}

/**
 * @brief Replaces accented characters with their unaccented equivalents.
 * @param input The input string, decoded as UTF-8.
 * @return A UTF-8 string in which every character found in accentMap_ has
 *         been substituted; all other characters are kept as they are.
 */
std::string RSPL::removeAccents(const std::string& input) {
  using Utf8Converter = std::wstring_convert<std::codecvt_utf8<wchar_t>>;

  // Work on wide characters so each accented letter is a single element.
  std::wstring wide = Utf8Converter().from_bytes(input);

  // Substitute in place; the mapping is strictly one character to one.
  for (wchar_t& ch : wide) {
    const auto hit = accentMap_.find(ch);
    if (hit != accentMap_.end()) {
      ch = hit->second;
    }
  }

  // Encode back to UTF-8 for the caller.
  return Utf8Converter().to_bytes(wide);
}

/**
 * @brief Converts a UTF-8 string to lowercase, in place.
 *
 * A null @p input raises exceptions::invalid_pointer_exception inside this
 * function's own try block. Any std::exception caught here is reported on
 * std::cerr and not rethrown.
 * NOTE(review): the handler only matches std::exception; this presumably
 * relies on invalid_pointer_exception deriving from it — confirm against the
 * exception's declaration, otherwise the throw escapes to the caller.
 *
 * @param input Pointer to the string to convert; modified in place.
 */
void RSPL::shrinkString(std::string* input) {
try {
if (!input)
throw exceptions::invalid_pointer_exception();

// Decode the bytes as UTF-8, lowercase them, and re-encode as UTF-8.
icu::UnicodeString ustr(input->c_str(), "UTF-8");
ustr.toLower();
std::string result;
ustr.toUTF8String(result);
*input = result;
} catch (const std::exception& e) {
// Print the exception message to stderr instead of propagating it.
std::cerr << e.what() << '\n';
}
}

bool RSPL::applyRules(std::string& word, const std::vector<StepRule>& rules) {
for (const auto& rule : rules) {
// Verificar se a palavra termina com o sufixo especificado
Expand All @@ -234,9 +185,9 @@ bool RSPL::applyRules(std::string& word, const std::vector<StepRule>& rules) {

void RSPL::run(std::string* sentence) {
// Separar a sentença em palavras
this->shrinkString(sentence);
utils::shrink_string(sentence);
// std::cout << *sentence << std::endl;
std::vector<std::string> words = this->split(*sentence);
std::vector<std::string> words = utils::split(*sentence, utils::DELIMITER);

for (std::string& word : words) {
// PLURAL REDUCTION
Expand Down Expand Up @@ -271,7 +222,7 @@ void RSPL::run(std::string* sentence) {
}

// Função para remover acentos
word = removeAccents(word);
word = utils::removeAccents(word);
// std::cout << word << std::endl;
}

Expand Down
49 changes: 49 additions & 0 deletions lib/src/utils/string_operations.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#include "utils/string_operations.h"

namespace utils {

/**
 * @brief Converts a UTF-8 string to lowercase, in place.
 * @param input String to process; a null pointer is ignored.
 */
void shrink_string(std::string* input) {
  if (!input)
    return;  // Nothing to do for a null pointer.

  // Decode from the full byte range (pointer + length) rather than c_str():
  // the C-string form stops at the first NUL byte and opens a converter by
  // name on every call.
  icu::UnicodeString ustr =
      icu::UnicodeString::fromUTF8(icu::StringPiece(*input));
  ustr.toLower();

  // toUTF8String appends, so start from an empty string. ustr owns its own
  // copy of the text, so clearing the input here is safe.
  input->clear();
  ustr.toUTF8String(*input);
}

/**
 * @brief Splits a string on every occurrence of a delimiter.
 *
 * Empty tokens are kept (two adjacent delimiters yield an empty string). As a
 * side effect @p s is consumed: on return it holds only the last token.
 *
 * @param s String to split; modified as described above.
 * @param delimiter Separator to look for. When empty, no splitting is done and
 *                  the whole string is returned as a single token.
 * @return The tokens, in order of appearance.
 */
std::vector<std::string> split(std::string& s, const std::string& delimiter) {
  std::vector<std::string> tokens;

  // An empty delimiter matches at position 0 without consuming anything, so
  // the scan below would never advance. Treat it as "nothing to split on".
  if (delimiter.empty()) {
    tokens.push_back(s);
    return tokens;
  }

  // Walk the string with an offset instead of erasing the front after every
  // token, which would shift the remaining characters each time.
  size_t start = 0;
  size_t pos = 0;
  while ((pos = s.find(delimiter, start)) != std::string::npos) {
    tokens.push_back(s.substr(start, pos - start));
    start = pos + delimiter.length();
  }

  // Leave only the unconsumed tail in s, then emit it as the last token.
  s.erase(0, start);
  tokens.push_back(s);

  return tokens;
}

/**
 * @brief Replaces accented characters with their unaccented equivalents.
 * @param input The input string, decoded as UTF-8.
 * @return A UTF-8 string in which every character found in accentMap_ has
 *         been substituted; all other characters are kept as they are.
 */
std::string removeAccents(const std::string& input) {
  using Utf8Converter = std::wstring_convert<std::codecvt_utf8<wchar_t>>;

  // Work on wide characters so each accented letter is a single element.
  std::wstring wide = Utf8Converter().from_bytes(input);

  // Substitute in place; the mapping is strictly one character to one.
  for (wchar_t& ch : wide) {
    const auto hit = accentMap_.find(ch);
    if (hit != accentMap_.end()) {
      ch = hit->second;
    }
  }

  // Encode back to UTF-8 for the caller.
  return Utf8Converter().to_bytes(wide);
}

} // namespace utils

0 comments on commit 438da18

Please sign in to comment.