From 1dda89e9d836414afc990301ed7f77f76417a270 Mon Sep 17 00:00:00 2001 From: Pedro Bianchini de Quadros Date: Mon, 27 Jan 2025 11:12:56 -0300 Subject: [PATCH] refactor(string_operations.h/cpp): add new util operations Change files that use these operations --- lib/CMakeLists.txt | 1 + lib/include/inverted_index.h | 30 +------------- lib/include/preprocessing/stemmer.h | 21 +--------- lib/include/utils/string_operations.h | 56 +++++++++++++++++++++++++++ lib/src/inverted_index.cpp | 33 ++-------------- lib/src/preprocessing/stemmer.cpp | 55 ++------------------------ lib/src/utils/string_operations.cpp | 49 +++++++++++++++++++++++ 7 files changed, 116 insertions(+), 129 deletions(-) create mode 100644 lib/include/utils/string_operations.h create mode 100644 lib/src/utils/string_operations.cpp diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index 9f60196..407cfd8 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -8,6 +8,7 @@ add_library( src/inverted_index.cpp src/preprocessing/stemmer.cpp src/exceptions/invalid_pointer_exception.cpp + src/utils/string_operations.cpp ) target_include_directories(search_engine PUBLIC include) diff --git a/lib/include/inverted_index.h b/lib/include/inverted_index.h index 58d3ddb..cccf6d0 100644 --- a/lib/include/inverted_index.h +++ b/lib/include/inverted_index.h @@ -9,6 +9,8 @@ #include #include +#include "utils/string_operations.h" + namespace inverted_index { /** * @typedef str @@ -22,12 +24,6 @@ typedef std::string str; */ typedef std::list list_str; -/** - * @define DELIMITER - * @brief Delimiter used to split strings. - */ -#define DELIMITER " " - /** * @struct docs * @brief Structure that stores information about a document. @@ -107,23 +103,6 @@ typedef std::set set_docs; */ typedef std::vector vector_str; -/** - * @brief Converts a character to lowercase. - * @param c Character to convert. - * @return Character converted to lowercase. - */ -char to_lowercase(unsigned char c) { - return std::tolower(c); -} - -/** - * @brief Splits a string based on a delimiter. - * @param s String to split. - * @param delimiter Delimiter to split the string. - * @return Vector of strings resulting from the split. - */ -vector_str split(str& s, const str& delimiter); - /** * @brief Adds a new document to the document map. * @param mp Map of words to lists of documents. @@ -149,11 +128,6 @@ list_docs find_doc(map_str_docs& mp, str& word); */ list_docs find_answer(map_str_docs& mp, str& input); -/** - * @brief Removes unwanted characters from a string, such as extra spaces. - * @param input String to process. - */ -void shrink_string(std::string* input); } // namespace inverted_index #endif // INVERTED_INDEX diff --git a/lib/include/preprocessing/stemmer.h b/lib/include/preprocessing/stemmer.h index 23d82d3..d52c32e 100644 --- a/lib/include/preprocessing/stemmer.h +++ b/lib/include/preprocessing/stemmer.h @@ -12,6 +12,7 @@ #include #include "exceptions/invalid_pointer_exception.h" +#include "utils/string_operations.h" namespace stemmer { @@ -89,26 +90,6 @@ class RSPL { * @return `true` if the word ends with 'a', otherwise `false`. */ bool endsWithA(const std::string& word); - - /** - * @brief Splits a string into parts based on delimiters. - * @param s The string to be split. - * @return A vector containing the parts of the string. - */ - std::vector split(std::string& s); - - /** - * @brief Removes accents from a string. - * @param input The input string. - * @return The string without accents. - */ - std::string removeAccents(const std::string& input); - - /** - * @brief Shrinks the size of a string to normalize it. - * @param input Pointer to the input string. - */ - void shrinkString(std::string* input); }; } // namespace stemmer diff --git a/lib/include/utils/string_operations.h b/lib/include/utils/string_operations.h new file mode 100644 index 0000000..36e7003 --- /dev/null +++ b/lib/include/utils/string_operations.h @@ -0,0 +1,56 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +namespace utils { + +/** + * @define DELIMITER + * @brief Delimiter used to split strings. + */ +constexpr std::string DELIMITER = " "; + +/** + * @brief Mapping to normalize accented characters to their ASCII + * equivalents. + */ +const std::unordered_map accentMap_ = { + {L'á', L'a'}, {L'à', L'a'}, {L'â', L'a'}, {L'ã', L'a'}, {L'ä', L'a'}, + {L'é', L'e'}, {L'è', L'e'}, {L'ê', L'e'}, {L'ë', L'e'}, {L'í', L'i'}, + {L'ì', L'i'}, {L'î', L'i'}, {L'ï', L'i'}, {L'ó', L'o'}, {L'ò', L'o'}, + {L'ô', L'o'}, {L'õ', L'o'}, {L'ö', L'o'}, {L'ú', L'u'}, {L'ù', L'u'}, + {L'û', L'u'}, {L'ü', L'u'}, {L'ç', L'c'}, {L'Á', L'A'}, {L'À', L'A'}, + {L'Â', L'A'}, {L'Ã', L'A'}, {L'Ä', L'A'}, {L'É', L'E'}, {L'È', L'E'}, + {L'Ê', L'E'}, {L'Ë', L'E'}, {L'Í', L'I'}, {L'Ì', L'I'}, {L'Î', L'I'}, + {L'Ï', L'I'}, {L'Ó', L'O'}, {L'Ò', L'O'}, {L'Ô', L'O'}, {L'Õ', L'O'}, + {L'Ö', L'O'}, {L'Ú', L'U'}, {L'Ù', L'U'}, {L'Û', L'U'}, {L'Ü', L'U'}, + {L'Ç', L'C'}}; +// Funções para manipular string + +/** + * @brief Removes unwanted characters from a string, such as extra spaces. + * @param input String to process. + */ +void shrink_string(std::string* input); + +/** + * @brief Splits a string based on a delimiter. + * @param s String to split. + * @param delimiter Delimiter to split the string. + * @return Vector of strings resulting from the split. + */ +std::vector split(std::string& s, const std::string& delimiter); + +/** + * @brief Removes accents from a string. + * @param input The input string. + * @return The string without accents. + */ +std::string removeAccents(const std::string& input); + +} // namespace utils diff --git a/lib/src/inverted_index.cpp b/lib/src/inverted_index.cpp index 269ea45..bc4a71e 100644 --- a/lib/src/inverted_index.cpp +++ b/lib/src/inverted_index.cpp @@ -8,36 +8,11 @@ using namespace inverted_index; -vector_str inverted_index::split(str& s, const str& delimiter) { - vector_str tokens; - size_t pos = 0; - std::string token; - while ((pos = s.find(delimiter)) != std::string::npos) { - token = s.substr(0, pos); - tokens.push_back(token); - s.erase(0, pos + delimiter.length()); - } - tokens.push_back(s); - - return tokens; -} - -void inverted_index::shrink_string(std::string* input) { - if (!input) - return; // Verifica se o ponteiro é válido - - icu::UnicodeString ustr(input->c_str(), "UTF-8"); - ustr.toLower(); - std::string result; - ustr.toUTF8String(result); - *input = result; -} - map_str_docs inverted_index::add_doc(map_str_docs& mp, const str& doc_name, str& text) { - shrink_string(&text); - auto words = inverted_index::split(text, DELIMITER); + utils::shrink_string(&text); + auto words = utils::split(text, utils::DELIMITER); for (const auto& word : words) { docs target = {doc_name, 1}; @@ -61,8 +36,8 @@ list_docs inverted_index::find_answer(map_str_docs& mp, str& input) { list_docs result; set_docs unique_docs; - shrink_string(&input); - auto words = inverted_index::split(input, DELIMITER); + utils::shrink_string(&input); + auto words = utils::split(input, utils::DELIMITER); for (auto& word : words) { list_docs docs = inverted_index::find_doc( diff --git a/lib/src/preprocessing/stemmer.cpp b/lib/src/preprocessing/stemmer.cpp index b1c884d..08cf9b9 100644 --- a/lib/src/preprocessing/stemmer.cpp +++ b/lib/src/preprocessing/stemmer.cpp @@ -164,55 +164,6 @@ bool RSPL::endsWithS(const std::string& word) { return false; } -std::vector RSPL::split(std::string& s) { - std::vector tokens; - size_t pos = 0; - std::string token; - std::string delimiter = " "; - while ((pos = s.find(delimiter)) != std::string::npos) { - token = s.substr(0, pos); - tokens.push_back(token); - s.erase(0, pos + delimiter.length()); - } - tokens.push_back(s); - - return tokens; -} - -std::string RSPL::removeAccents(const std::string& input) { - std::wstring winput = - std::wstring_convert>().from_bytes(input); - std::wstring woutput; - woutput.reserve(winput.size()); // Evitar alocações desnecessárias - - // Processar a string como wstring - for (wchar_t ch : winput) { - if (accentMap_.count(ch)) { - woutput.push_back(accentMap_.at(ch)); // Substituir acentuados - } else { - woutput.push_back(ch); // Mantém o caractere não acentuado - } - } - - // Converter de volta para std::string - return std::wstring_convert>().to_bytes(woutput); -} - -void RSPL::shrinkString(std::string* input) { - try { - if (!input) - throw exceptions::invalid_pointer_exception(); - - icu::UnicodeString ustr(input->c_str(), "UTF-8"); - ustr.toLower(); - std::string result; - ustr.toUTF8String(result); - *input = result; - } catch (const std::exception& e) { - std::cerr << e.what() << '\n'; - } -} - bool RSPL::applyRules(std::string& word, const std::vector& rules) { for (const auto& rule : rules) { // Verificar se a palavra termina com o sufixo especificado @@ -234,9 +185,9 @@ bool RSPL::applyRules(std::string& word, const std::vector& rules) { void RSPL::run(std::string* sentence) { // Separar a sentença em palavras - this->shrinkString(sentence); + utils::shrink_string(sentence); // std::cout << *sentence << std::endl; - std::vector words = this->split(*sentence); + std::vector words = utils::split(*sentence, utils::DELIMITER); for (std::string& word : words) { // PLURAL REDUCTION @@ -271,7 +222,7 @@ void RSPL::run(std::string* sentence) { } // Função para remover acentos - word = removeAccents(word); + word = utils::removeAccents(word); // std::cout << word << std::endl; } diff --git a/lib/src/utils/string_operations.cpp b/lib/src/utils/string_operations.cpp new file mode 100644 index 0000000..3ce320a --- /dev/null +++ b/lib/src/utils/string_operations.cpp @@ -0,0 +1,49 @@ +#include "utils/string_operations.h" + +namespace utils { + +void shrink_string(std::string* input) { + if (!input) + return; // Verifica se o ponteiro é válido + + icu::UnicodeString ustr(input->c_str(), "UTF-8"); + ustr.toLower(); + std::string result; + ustr.toUTF8String(result); + *input = result; +} + +std::vector split(std::string& s, const std::string& delimiter) { + std::vector tokens; + size_t pos = 0; + std::string token; + while ((pos = s.find(delimiter)) != std::string::npos) { + token = s.substr(0, pos); + tokens.push_back(token); + s.erase(0, pos + delimiter.length()); + } + tokens.push_back(s); + + return tokens; +} + +std::string removeAccents(const std::string& input) { + std::wstring winput = + std::wstring_convert>().from_bytes(input); + std::wstring woutput; + woutput.reserve(winput.size()); // Evitar alocações desnecessárias + + // Processar a string como wstring + for (wchar_t ch : winput) { + if (accentMap_.count(ch)) { + woutput.push_back(accentMap_.at(ch)); // Substituir acentuados + } else { + woutput.push_back(ch); // Mantém o caractere não acentuado + } + } + + // Converter de volta para std::string + return std::wstring_convert>().to_bytes(woutput); +} + +} // namespace utils