Commit 627bc25
(fix): corrected setup.py to include SentencePiece model
prabhashj07 committed Jul 23, 2024
1 parent d2f8b91 commit 627bc25
Showing 5 changed files with 133 additions and 8 deletions.
2 changes: 2 additions & 0 deletions MANIFEST.in
@@ -0,0 +1,2 @@
include nepalikit/tokenization/sentencepiece/model/NepaliKit_sentencepiece.model
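
MANIFEST.in only affects what gets bundled into the source distribution. A minimal sanity check, assuming a hypothetical archive name under dist/, is to build an sdist (python -m build --sdist) and inspect it:

import tarfile

# The version in the archive name is hypothetical; adjust to the actual build.
with tarfile.open("dist/nepalikit-0.1.0.tar.gz") as sdist:
    has_model = any(name.endswith("NepaliKit_sentencepiece.model")
                    for name in sdist.getnames())
    assert has_model, "SentencePiece model missing from the sdist"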

41 changes: 40 additions & 1 deletion nepalikit/sentence_operation/extract_sentences.py
@@ -12,13 +12,42 @@
from nepalikit.preprocessing.TextProcessor import TextProcessor

class extract_sentences:
"""
A class to extract sentences from Nepali text, which includes normalization,
processing and handling of common edge cases like abbreviations.
Attributes:
----------
text: str, the text to be processed and from which sentences will be extracteed : TextProcessor
An instance of TextProcessor to handle text normalization and preprocessing.
Methods:
-------
normalize_text() -> str:
- Normalize the text using the TextProcessor.
preprocess_text(normalized_text: str) -> str:
- Preprocess the normalized text using the TextProcessor.
extract_sentences -> list:
- Extract and returns a list of sentences from the text.
"""
def __init__(self, text: str):
"""
Initializes the extract_sentences class with the provided text and initializes a TextProcessor instance.
Parameters:
text: str, the text to be processed and from which sentences will be extracted.
"""
self.text = text
self.processor = TextProcessor()

def normalize_text(self) -> str:
"""
Normalize the text using TextProcessor.
Returns:
str: The normalized text.
"""
# Delegate normalization to TextProcessor's normalize_text method
normalized_text = self.processor.normalize_text(self.text)
@@ -27,6 +56,13 @@ def normalize_text(self) -> str:
def preprocess_text(self, normalized_text: str) -> str:
"""
Preprocess the text using TextProcessor.
Parameters:
normalized_text: str
- The normalized text to be preprocessed.
Returns:
str: The preprocessed text.
"""
cleaned_text = self.processor.preprocess_text(normalized_text)
return cleaned_text
@@ -35,6 +71,9 @@ def extract_sentences(self) -> list:
"""
Splits the given Nepali text into sentences based on punctuation marks,
handling common edge cases like abbreviations.
Returns:
list: A list of extracted sentences.
"""
normalized_text = self.normalize_text()
cleaned_text = self.preprocess_text(normalized_text)
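
A minimal usage sketch for the class above; the sample text and the exact split are illustrative, since the real behavior depends on TextProcessor's rules:

from nepalikit.sentence_operation.extract_sentences import extract_sentences

# Two short Nepali sentences separated by the danda (।) punctuation mark.
text = "नेपाल सुन्दर देश हो। यहाँ हिमालहरू छन्।"
extractor = extract_sentences(text)
sentences = extractor.extract_sentences()
print(sentences)  # e.g. ['नेपाल सुन्दर देश हो', 'यहाँ हिमालहरू छन्']
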
19 changes: 18 additions & 1 deletion nepalikit/tokenization/sentencepiece_tokenizer.py
@@ -12,7 +12,15 @@ def __init__(self):
print("Model found and path is correct.")

def tokenize(self, text):
"""Tokenizes text using SentencePiece model loaded from file."""
"""
Tokenizes text using SentencePiece model loaded from file.
Parameters:
- text: str, a string to be tokenized.
Returns:
- a list of str: tokenized.
"""
model_path = os.path.join(self.this_dir, "sentencepiece", "model", "NepaliKit_sentencepiece.model")
try:
sp = spm.SentencePieceProcessor()
@@ -25,6 +33,15 @@ def tokenize(self, text):
return sp.EncodeAsPieces(text)

def detokenize(self, tokens):
"""
Detokenize text using SentencePiece model loaded from file.
Parameters:
- a list of str: tokenized string to be converted into original form.
Returns:
- original form: text, string
"""
model_path = os.path.join(self.this_dir, "sentencepiece", "model", "NepaliKit_sentencepiece.model")
try:
sp = spm.SentencePieceProcessor()
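
A round-trip sketch for the two methods above. The class name SentencePieceTokenizer is an assumption, since the class declaration sits outside the visible hunk; restoring the exact input holds for typical SentencePiece models:

from nepalikit.tokenization.sentencepiece_tokenizer import SentencePieceTokenizer  # class name assumed

tokenizer = SentencePieceTokenizer()
pieces = tokenizer.tokenize("नेपाली भाषा")   # list of subword pieces
restored = tokenizer.detokenize(pieces)      # back to a plain string
print(pieces, restored)
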
71 changes: 67 additions & 4 deletions nepalikit/utils/utils.py
@@ -11,24 +11,87 @@
from collections import Counter

class NepaliTextProcessor:
"""
A class for processing Nepali text, including merging and splitting text,
and counting words in sentences and paragraphs.
Attributes:
----------
delimiter: str
The delimiter used to split or join text tokens (default: space).
Methods:
-------
merge_text(tokens):
- Merges a list of tokens into a single string, separated by the delimiter.
split_text(text):
- Splits a text string into a list of tokens using the delimiter.
count_words(text):
- Counts the number of words in a text string by splitting it with the delimiter.
count_words_in_paragraph(paragraph):
- Counts the total number of words in a paragraph, where the paragraph is split into sentences.
"""
def __init__(self, delimiter=' '):
"""
Initialize the NepaliTextProcessor with a specified delimiter.
Parameters:
delimiter: str (optional), the delimiter used to split or join text tokens (default: space).
"""
self.delimiter = delimiter

def merge_text(self, tokens):
"""Merge tokens into text with a delimiter."""
"""
Merge tokens into text with a delimiter.
Parameters:
tokens: list of str, a list of text tokens to be merged.
Returns:
str, a single string where tokens are joined by the delimiter.
"""
return self.delimiter.join(tokens)

def split_text(self, text):
"""Split text by a delimiter."""
"""
Split text by a delimiter.
Parameters:
text: str, the text string to be split.
Returns:
list of str, a list of tokens obtained by splitting the text using the delimiter.
"""
return text.split(self.delimiter)

def count_words(self, text):
"""Count total words in a sentence."""
"""
Count total words in a sentence.
Parameters:
text: str, the text string whose words are to be counted.
Returns:
int: the number of words in the text string.
"""
tokens = self.split_text(text)
return len(tokens)

def count_words_in_paragraph(self, paragraph):
"""Count total words in a paragraph."""
"""
Count total words in a paragraph.
Parameters:
paragraph: str, the paragraph whose words are to be counted.
Returns:
int: the total number of words in the paragraph.
"""
total_words = 0
sentences = paragraph.split('.')
for sentence in sentences:
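
A short usage sketch for the class above, importing directly from the file shown (nepalikit/utils/utils.py); whether the name is also re-exported at the package level is not visible in this diff:

from nepalikit.utils.utils import NepaliTextProcessor

processor = NepaliTextProcessor(delimiter=' ')
tokens = processor.split_text('नेपाल सुन्दर देश हो')
print(tokens)                        # ['नेपाल', 'सुन्दर', 'देश', 'हो']
print(processor.merge_text(tokens))  # 'नेपाल सुन्दर देश हो'
print(processor.count_words('नेपाल सुन्दर देश हो'))  # 4
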
8 changes: 6 additions & 2 deletions setup.py
@@ -12,6 +12,10 @@
'sentencepiece==0.2.0',
'regex'
],
include_package_data=True,
package_data={
'nepalikit': ['tokenization/sentencepiece/model/NepaliKit_sentencepiece.model'],
},
entry_points={
'console_scripts': [
'nepalikit-cli = nepalikit.__main__:main',
@@ -20,7 +24,7 @@
author='Prabhash Kumar Jha',
author_email='[email protected]',
description='A Nepali language processing library',
long_description=open('README.md').read(),
long_description_content_type='text/markdown',
url='https://github.com/prabhashj07/nepalikit.git',
license='MIT',
@@ -35,9 +39,9 @@
'Topic :: Text Processing :: Linguistic',
],
python_requires='>=3.7',
project_urls={
'Bug Reports': 'https://github.com/prabhashj07/nepalikit/issues',
'Source': 'https://github.com/prabhashj07/nepalikit/',
},
)
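
With include_package_data and package_data in place, the installed model can be resolved relative to the package instead of the source tree. A minimal sketch using importlib.resources.files, which requires Python 3.9+ (the tokenizer in this commit builds the path with os.path.join instead):

from importlib.resources import files

# Walk from the installed package to the bundled model file.
model = (files('nepalikit') / 'tokenization' / 'sentencepiece'
         / 'model' / 'NepaliKit_sentencepiece.model')
print(model.is_file())  # True once nepalikit is installed with the model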
