From 1dda89e9d836414afc990301ed7f77f76417a270 Mon Sep 17 00:00:00 2001
From: Pedro Bianchini de Quadros <pedrobiqua@gmail.com>
Date: Mon, 27 Jan 2025 11:12:56 -0300
Subject: [PATCH] refactor(string_operations.h/cpp): add new util operations

Change files that use these operations
---
 lib/CMakeLists.txt                    |  1 +
 lib/include/inverted_index.h          | 30 +-------------
 lib/include/preprocessing/stemmer.h   | 21 +---------
 lib/include/utils/string_operations.h | 56 +++++++++++++++++++++++++++
 lib/src/inverted_index.cpp            | 33 ++--------------
 lib/src/preprocessing/stemmer.cpp     | 55 ++------------------------
 lib/src/utils/string_operations.cpp   | 49 +++++++++++++++++++++++
 7 files changed, 116 insertions(+), 129 deletions(-)
 create mode 100644 lib/include/utils/string_operations.h
 create mode 100644 lib/src/utils/string_operations.cpp
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
index 9f60196..407cfd8 100644
--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@@ -8,6 +8,7 @@ add_library(
             src/inverted_index.cpp
             src/preprocessing/stemmer.cpp
             src/exceptions/invalid_pointer_exception.cpp
+            src/utils/string_operations.cpp
 )
 
 target_include_directories(search_engine PUBLIC include)
diff --git a/lib/include/inverted_index.h b/lib/include/inverted_index.h
index 58d3ddb..cccf6d0 100644
--- a/lib/include/inverted_index.h
+++ b/lib/include/inverted_index.h
@@ -9,6 +9,8 @@
 #include <string>
 #include <vector>
 
+#include "utils/string_operations.h"
+
 namespace inverted_index {
 /**
  * @typedef str
@@ -22,12 +24,6 @@ typedef std::string str;
  */
 typedef std::list<str> list_str;
 
-/**
- * @define DELIMITER
- * @brief Delimiter used to split strings.
- */
-#define DELIMITER " "
-
 /**
  * @struct docs
  * @brief Structure that stores information about a document.
@@ -107,23 +103,6 @@ typedef std::set<docs> set_docs;
  */
 typedef std::vector<str> vector_str;
 
-/**
- * @brief Converts a character to lowercase.
- * @param c Character to convert.
- * @return Character converted to lowercase.
- */
-char to_lowercase(unsigned char c) {
-    return std::tolower(c);
-}
-
-/**
- * @brief Splits a string based on a delimiter.
- * @param s String to split.
- * @param delimiter Delimiter to split the string.
- * @return Vector of strings resulting from the split.
- */
-vector_str split(str& s, const str& delimiter);
-
 /**
  * @brief Adds a new document to the document map.
  * @param mp Map of words to lists of documents.
@@ -149,11 +128,6 @@ list_docs find_doc(map_str_docs& mp, str& word);
  */
 list_docs find_answer(map_str_docs& mp, str& input);
 
-/**
- * @brief Removes unwanted characters from a string, such as extra spaces.
- * @param input String to process.
- */
-void shrink_string(std::string* input);
 }  // namespace inverted_index
 
 #endif  // INVERTED_INDEX
diff --git a/lib/include/preprocessing/stemmer.h b/lib/include/preprocessing/stemmer.h
index 23d82d3..d52c32e 100644
--- a/lib/include/preprocessing/stemmer.h
+++ b/lib/include/preprocessing/stemmer.h
@@ -12,6 +12,7 @@
 #include <vector>
 
 #include "exceptions/invalid_pointer_exception.h"
+#include "utils/string_operations.h"
 
 namespace stemmer {
 
@@ -89,26 +90,6 @@ class RSPL {
      * @return `true` if the word ends with 'a', otherwise `false`.
      */
     bool endsWithA(const std::string& word);
-
-    /**
-     * @brief Splits a string into parts based on delimiters.
-     * @param s The string to be split.
-     * @return A vector containing the parts of the string.
-     */
-    std::vector<std::string> split(std::string& s);
-
-    /**
-     * @brief Removes accents from a string.
-     * @param input The input string.
-     * @return The string without accents.
-     */
-    std::string removeAccents(const std::string& input);
-
-    /**
-     * @brief Shrinks the size of a string to normalize it.
-     * @param input Pointer to the input string.
-     */
-    void shrinkString(std::string* input);
 };
 
 }  // namespace stemmer
diff --git a/lib/include/utils/string_operations.h b/lib/include/utils/string_operations.h
new file mode 100644
index 0000000..36e7003
--- /dev/null
+++ b/lib/include/utils/string_operations.h
@@ -0,0 +1,57 @@
+#pragma once  // Included by several headers; prevents redefinition errors
+#include <unicode/locid.h>
+#include <unicode/unistr.h>
+#include <unicode/ustream.h>
+#include <codecvt>
+#include <locale>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+namespace utils {
+
+/**
+ * @brief Delimiter used to split strings. Plain `const`: std::string is not
+ * a literal type before C++20, so it cannot be `constexpr` there.
+ */
+const std::string DELIMITER = " ";
+
+/**
+ * @brief Maps accented lowercase and uppercase letters to their unaccented
+ * ASCII equivalents; looked up per character by removeAccents.
+ */
+const std::unordered_map<wchar_t, wchar_t> accentMap_ = {
+    {L'á', L'a'}, {L'à', L'a'}, {L'â', L'a'}, {L'ã', L'a'}, {L'ä', L'a'},
+    {L'é', L'e'}, {L'è', L'e'}, {L'ê', L'e'}, {L'ë', L'e'}, {L'í', L'i'},
+    {L'ì', L'i'}, {L'î', L'i'}, {L'ï', L'i'}, {L'ó', L'o'}, {L'ò', L'o'},
+    {L'ô', L'o'}, {L'õ', L'o'}, {L'ö', L'o'}, {L'ú', L'u'}, {L'ù', L'u'},
+    {L'û', L'u'}, {L'ü', L'u'}, {L'ç', L'c'}, {L'Á', L'A'}, {L'À', L'A'},
+    {L'Â', L'A'}, {L'Ã', L'A'}, {L'Ä', L'A'}, {L'É', L'E'}, {L'È', L'E'},
+    {L'Ê', L'E'}, {L'Ë', L'E'}, {L'Í', L'I'}, {L'Ì', L'I'}, {L'Î', L'I'},
+    {L'Ï', L'I'}, {L'Ó', L'O'}, {L'Ò', L'O'}, {L'Ô', L'O'}, {L'Õ', L'O'},
+    {L'Ö', L'O'}, {L'Ú', L'U'}, {L'Ù', L'U'}, {L'Û', L'U'}, {L'Ü', L'U'},
+    {L'Ç', L'C'}};
+// String manipulation functions
+
+/**
+ * @brief Converts a UTF-8 string to lowercase in place.
+ * @param input String to lowercase; a null pointer is ignored.
+ */
+void shrink_string(std::string* input);
+
+/**
+ * @brief Splits a string on a delimiter; `s` is left holding the last token.
+ * @param s String to split (modified in place while it is consumed).
+ * @param delimiter Delimiter to split the string.
+ * @return Vector of strings resulting from the split.
+ */
+std::vector<std::string> split(std::string& s, const std::string& delimiter);
+
+/**
+ * @brief Replaces the accented characters listed in accentMap_.
+ * @param input The input string, decoded as UTF-8.
+ * @return The string without accents, re-encoded as UTF-8.
+ */
+std::string removeAccents(const std::string& input);
+
+}  // namespace utils
diff --git a/lib/src/inverted_index.cpp b/lib/src/inverted_index.cpp
index 269ea45..bc4a71e 100644
--- a/lib/src/inverted_index.cpp
+++ b/lib/src/inverted_index.cpp
@@ -8,36 +8,11 @@
 
 using namespace inverted_index;
 
-vector_str inverted_index::split(str& s, const str& delimiter) {
-    vector_str tokens;
-    size_t pos = 0;
-    std::string token;
-    while ((pos = s.find(delimiter)) != std::string::npos) {
-        token = s.substr(0, pos);
-        tokens.push_back(token);
-        s.erase(0, pos + delimiter.length());
-    }
-    tokens.push_back(s);
-
-    return tokens;
-}
-
-void inverted_index::shrink_string(std::string* input) {
-    if (!input)
-        return;  // Verifica se o ponteiro é válido
-
-    icu::UnicodeString ustr(input->c_str(), "UTF-8");
-    ustr.toLower();
-    std::string result;
-    ustr.toUTF8String(result);
-    *input = result;
-}
-
 map_str_docs inverted_index::add_doc(map_str_docs& mp,
                                      const str& doc_name,
                                      str& text) {
-    shrink_string(&text);
-    auto words = inverted_index::split(text, DELIMITER);
+    utils::shrink_string(&text);
+    auto words = utils::split(text, utils::DELIMITER);
 
     for (const auto& word : words) {
         docs target = {doc_name, 1};
@@ -61,8 +36,8 @@ list_docs inverted_index::find_answer(map_str_docs& mp, str& input) {
     list_docs result;
     set_docs unique_docs;
 
-    shrink_string(&input);
-    auto words = inverted_index::split(input, DELIMITER);
+    utils::shrink_string(&input);
+    auto words = utils::split(input, utils::DELIMITER);
 
     for (auto& word : words) {
         list_docs docs = inverted_index::find_doc(
diff --git a/lib/src/preprocessing/stemmer.cpp b/lib/src/preprocessing/stemmer.cpp
index b1c884d..08cf9b9 100644
--- a/lib/src/preprocessing/stemmer.cpp
+++ b/lib/src/preprocessing/stemmer.cpp
@@ -164,55 +164,6 @@ bool RSPL::endsWithS(const std::string& word) {
     return false;
 }
 
-std::vector<std::string> RSPL::split(std::string& s) {
-    std::vector<std::string> tokens;
-    size_t pos = 0;
-    std::string token;
-    std::string delimiter = " ";
-    while ((pos = s.find(delimiter)) != std::string::npos) {
-        token = s.substr(0, pos);
-        tokens.push_back(token);
-        s.erase(0, pos + delimiter.length());
-    }
-    tokens.push_back(s);
-
-    return tokens;
-}
-
-std::string RSPL::removeAccents(const std::string& input) {
-    std::wstring winput =
-        std::wstring_convert<std::codecvt_utf8<wchar_t>>().from_bytes(input);
-    std::wstring woutput;
-    woutput.reserve(winput.size());  // Evitar alocações desnecessárias
-
-    // Processar a string como wstring
-    for (wchar_t ch : winput) {
-        if (accentMap_.count(ch)) {
-            woutput.push_back(accentMap_.at(ch));  // Substituir acentuados
-        } else {
-            woutput.push_back(ch);  // Mantém o caractere não acentuado
-        }
-    }
-
-    // Converter de volta para std::string
-    return std::wstring_convert<std::codecvt_utf8<wchar_t>>().to_bytes(woutput);
-}
-
-void RSPL::shrinkString(std::string* input) {
-    try {
-        if (!input)
-            throw exceptions::invalid_pointer_exception();
-
-        icu::UnicodeString ustr(input->c_str(), "UTF-8");
-        ustr.toLower();
-        std::string result;
-        ustr.toUTF8String(result);
-        *input = result;
-    } catch (const std::exception& e) {
-        std::cerr << e.what() << '\n';
-    }
-}
-
 bool RSPL::applyRules(std::string& word, const std::vector<StepRule>& rules) {
     for (const auto& rule : rules) {
         // Verificar se a palavra termina com o sufixo especificado
@@ -234,9 +185,9 @@ bool RSPL::applyRules(std::string& word, const std::vector<StepRule>& rules) {
 
 void RSPL::run(std::string* sentence) {
     // Separar a sentença em palavras
-    this->shrinkString(sentence);
+    utils::shrink_string(sentence);
     // std::cout << *sentence << std::endl;
-    std::vector<std::string> words = this->split(*sentence);
+    std::vector<std::string> words = utils::split(*sentence, utils::DELIMITER);
 
     for (std::string& word : words) {
         // PLURAL REDUCTION
@@ -271,7 +222,7 @@ void RSPL::run(std::string* sentence) {
         }
 
         // Função para remover acentos
-        word = removeAccents(word);
+        word = utils::removeAccents(word);
         // std::cout << word << std::endl;
     }
 
diff --git a/lib/src/utils/string_operations.cpp b/lib/src/utils/string_operations.cpp
new file mode 100644
index 0000000..3ce320a
--- /dev/null
+++ b/lib/src/utils/string_operations.cpp
@@ -0,0 +1,49 @@
+#include "utils/string_operations.h"
+
+namespace utils {
+
+void shrink_string(std::string* input) {
+    if (input == nullptr)
+        return;  // Null pointer: there is nothing to lowercase
+
+    icu::UnicodeString unicode_text(input->c_str(), "UTF-8");
+    unicode_text.toLower();
+    std::string lowered;
+    unicode_text.toUTF8String(lowered);
+    *input = lowered;
+}
+
+std::vector<std::string> split(std::string& s, const std::string& delimiter) {
+    std::vector<std::string> tokens;
+    size_t start = 0;  // Offset in `s` where the current token begins
+    size_t pos = 0;
+    while (!delimiter.empty() &&  // An empty delimiter would never advance
+           (pos = s.find(delimiter, start)) != std::string::npos) {
+        tokens.push_back(s.substr(start, pos - start));
+        start = pos + delimiter.length();
+    }
+    s.erase(0, start);  // Callers still see `s` reduced to the last token
+    tokens.push_back(s);
+    return tokens;
+}
+
+std::string removeAccents(const std::string& input) {
+    // Decode the UTF-8 bytes so each character can be inspected on its own
+    std::wstring_convert<std::codecvt_utf8<wchar_t>> decoder;
+    const std::wstring wide_input = decoder.from_bytes(input);
+
+    std::wstring wide_output;
+    wide_output.reserve(wide_input.size());  // Avoid repeated reallocation
+
+    for (const wchar_t ch : wide_input) {
+        // Mapped characters are replaced; every other one is copied as-is
+        const auto match = accentMap_.find(ch);
+        wide_output.push_back(match != accentMap_.end() ? match->second : ch);
+    }
+
+    // Encode the result back to UTF-8
+    std::wstring_convert<std::codecvt_utf8<wchar_t>> encoder;
+    return encoder.to_bytes(wide_output);
+}
+
+}  // namespace utils