Commit 627bc25
(fix): corrected setup.py to include SentencePiece model
prabhashj07 committed Jul 23, 2024
1 parent d2f8b91 commit 627bc25
Showing 5 changed files with 133 additions and 8 deletions.
2 changes: 2 additions & 0 deletions MANIFEST.in
@@ -0,0 +1,2 @@
include nepalikit/tokenization/sentencepiece/model/NepaliKit_sentencepiece.model
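
MANIFEST.in only affects what gets bundled into the source distribution. A minimal sanity check, assuming a hypothetical archive name under dist/, is to build an sdist (python -m build --sdist) and inspect it:

import tarfile

# The version in the archive name is hypothetical; adjust to the actual build.
with tarfile.open("dist/nepalikit-0.1.0.tar.gz") as sdist:
    has_model = any(name.endswith("NepaliKit_sentencepiece.model")
                    for name in sdist.getnames())
    assert has_model, "SentencePiece model missing from the sdist"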

41 changes: 40 additions & 1 deletion nepalikit/sentence_operation/extract_sentences.py
@@ -12,13 +12,42 @@
from nepalikit.preprocessing.TextProcessor import TextProcessor

class extract_sentences:
"""
A class to extract sentences from Nepali text, which includes normalization,
processing and handling of common edge cases like abbreviations.
Attributes:
----------
text: str, the text to be processed and from which sentences will be extracteed : TextProcessor
An instance of TextProcessor to handle text normalization and preprocessing.
Methods:
-------
normalize_text() -> str:
- Normalize the text using the TextProcessor.
preprocess_text(normalized_text: str) -> str:
- Preprocess the normalized text using the TextProcessor.
extract_sentences -> list:
- Extract and returns a list of sentences from the text.
"""
def __init__(self, text: str):
"""
Initializes the extract_sentences class with the provided text and initializes a TextProcessor instance.
Parameters:
text: str, the text to be processed and from which sentences will be extracted.
"""
self.text = text
self.processor = TextProcessor()

def normalize_text(self) -> str:
"""
Normalize the text using TextProcessor.
Returns:
str: The normalized text.
"""
# Delegate normalization to TextProcessor's normalize_text method
normalized_text = self.processor.normalize_text(self.text)
@@ -27,6 +56,13 @@ def normalize_text(self) -> str:
def preprocess_text(self, normalized_text: str) -> str:
"""
Preprocess the text using TextProcessor.
Parameters:
normalized_text: str
- The normalized text to be preprocessed.
Returns:
str: The preprocessed text.
"""
cleaned_text = self.processor.preprocess_text(normalized_text)
return cleaned_text
@@ -35,6 +71,9 @@ def extract_sentences(self) -> list:
"""
Splits the given Nepali text into sentences based on punctuation marks,
handling common edge cases like abbreviations.
Returns:
list: A list of extracted sentences.
"""
normalized_text = self.normalize_text()
cleaned_text = self.preprocess_text(normalized_text)
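
A minimal usage sketch for the class above; the sample text and the exact split are illustrative, since the real behavior depends on TextProcessor's rules:

from nepalikit.sentence_operation.extract_sentences import extract_sentences

# Two short Nepali sentences separated by the danda (।) punctuation mark.
text = "नेपाल सुन्दर देश हो। यहाँ हिमालहरू छन्।"
extractor = extract_sentences(text)
sentences = extractor.extract_sentences()
print(sentences)  # e.g. ['नेपाल सुन्दर देश हो', 'यहाँ हिमालहरू छन्']
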
19 changes: 18 additions & 1 deletion nepalikit/tokenization/sentencepiece_tokenizer.py
@@ -12,7 +12,15 @@ def __init__(self):
print("Model found and path is correct.")

def tokenize(self, text):
"""Tokenizes text using SentencePiece model loaded from file."""
"""
Tokenizes text using SentencePiece model loaded from file.
Parameters:
- text: str, a string to be tokenized.
Returns:
- a list of str: tokenized.
"""
model_path = os.path.join(self.this_dir, "sentencepiece", "model", "NepaliKit_sentencepiece.model")
try:
sp = spm.SentencePieceProcessor()
@@ -25,6 +33,15 @@ def tokenize(self, text):
return sp.EncodeAsPieces(text)

def detokenize(self, tokens):
"""
Detokenize text using SentencePiece model loaded from file.
Parameters:
- a list of str: tokenized string to be converted into original form.
Returns:
- original form: text, string
"""
model_path = os.path.join(self.this_dir, "sentencepiece", "model", "NepaliKit_sentencepiece.model")
try:
sp = spm.SentencePieceProcessor()
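
A round-trip sketch for the two methods above. The class name SentencePieceTokenizer is an assumption, since the class declaration sits outside the visible hunk; restoring the exact input holds for typical SentencePiece models:

from nepalikit.tokenization.sentencepiece_tokenizer import SentencePieceTokenizer  # class name assumed

tokenizer = SentencePieceTokenizer()
pieces = tokenizer.tokenize("नेपाली भाषा")   # list of subword pieces
restored = tokenizer.detokenize(pieces)      # back to a plain string
print(pieces, restored)
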
71 changes: 67 additions & 4 deletions nepalikit/utils/utils.py
@@ -11,24 +11,87 @@
from collections import Counter

class NepaliTextProcessor:
"""
A class for processing Nepali text, including merging and splitting text,
and counting words in sentences and paragraphs.
Attributes:
----------
delimiter: str
The delimiter used to split or join text tokens (default: space).
Methods:
-------
merge_text(tokens):
- Merges a list of tokens into a single string, separated by the delimiter.
split_text(text):
- Splits a text string into a list of tokens using the delimiter.
count_words(text):
- Counts the number of words in a text string by splitting it with the delimiter.
count_words_in_paragraph(paragraph):
- Counts the total number of words in a paragraph, where the paragraph is split into sentences.
"""
def __init__(self, delimiter=' '):
"""
Initialize the NepaliTextProcessor with a specified delimiter.
Parameters:
delimiter: str (optional), the delimiter used to split or join text tokens (default: space).
"""
self.delimiter = delimiter

def merge_text(self, tokens):
"""Merge tokens into text with a delimiter."""
"""
Merge tokens into text with a delimiter.
Parameters:
tokens: list of str, a list of text tokens to be merged.
Returns:
str, a single string where tokens are joined by the delimiter.
"""
return self.delimiter.join(tokens)

def split_text(self, text):
"""Split text by a delimiter."""
"""
Split text by a delimiter.
Parameters:
text: str, the text string to be split.
Returns:
list of str, a list of tokens obtained by splitting the text using the delimiter.
"""
return text.split(self.delimiter)

def count_words(self, text):
"""Count total words in a sentence."""
"""
Count total words in a sentence.
Parameters:
text: str, the text string whose words are to be counted.
Returns:
int: the number of words in the text string.
"""
tokens = self.split_text(text)
return len(tokens)

def count_words_in_paragraph(self, paragraph):
"""Count total words in a paragraph."""
"""
Count total words in a paragraph.
Parameters:
paragraph: str, the paragraph whose words are to be counted.
Returns:
int: the total number of words in the paragraph.
"""
total_words = 0
sentences = paragraph.split('.')
for sentence in sentences:
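
A short usage sketch for the class above, importing directly from the file shown (nepalikit/utils/utils.py); whether the name is also re-exported at the package level is not visible in this diff:

from nepalikit.utils.utils import NepaliTextProcessor

processor = NepaliTextProcessor(delimiter=' ')
tokens = processor.split_text('नेपाल सुन्दर देश हो')
print(tokens)                        # ['नेपाल', 'सुन्दर', 'देश', 'हो']
print(processor.merge_text(tokens))  # 'नेपाल सुन्दर देश हो'
print(processor.count_words('नेपाल सुन्दर देश हो'))  # 4
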
8 changes: 6 additions & 2 deletions setup.py
@@ -12,6 +12,10 @@
'sentencepiece==0.2.0',
'regex'
],
include_package_data=True,
package_data={
'nepalikit': ['tokenization/sentencepiece/model/NepaliKit_sentencepiece.model'],
},
entry_points={
'console_scripts': [
'nepalikit-cli = nepalikit.__main__:main',
@@ -20,7 +24,7 @@
author='Prabhash Kumar Jha',
author_email='[email protected]',
description='A Nepali language processing library',
long_description=open('README.md').read(),
long_description_content_type='text/markdown',
url='https://github.com/prabhashj07/nepalikit.git',
license='MIT',
@@ -35,9 +39,9 @@
'Topic :: Text Processing :: Linguistic',
],
python_requires='>=3.7',
project_urls={
'Bug Reports': 'https://github.com/prabhashj07/nepalikit/issues',
'Source': 'https://github.com/prabhashj07/nepalikit/',
},
)
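
With include_package_data and package_data in place, the installed model can be resolved relative to the package instead of the source tree. A minimal sketch using importlib.resources.files, which requires Python 3.9+ (the tokenizer in this commit builds the path with os.path.join instead):

from importlib.resources import files

# Walk from the installed package to the bundled model file.
model = (files('nepalikit') / 'tokenization' / 'sentencepiece'
         / 'model' / 'NepaliKit_sentencepiece.model')
print(model.is_file())  # True once nepalikit is installed with the model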
