
Commit

Merge pull request #21 from ThHuberSG/syllables
adds number_of_n_syllable_words_all function
Perevalov authored Nov 5, 2024
2 parents b1bd77d + cb553c2 commit cc8f4b6
Showing 2 changed files with 22 additions and 10 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -27,6 +27,7 @@ The following descriptive statistics are supported (`descriptive_statistics.py`
 * Number of syllables `syllable_count`
 * Number of sentences `sentence_count`
 * Number of n-syllable words `number_of_n_syllable_words`
+* Number of n-syllable words for all found syllables `number_of_n_syllable_words_all`
 * Average syllables per word `avg_syllable_per_word`
 * Average word length `avg_word_length`
 * Average sentence length `avg_sentence_length`
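For reference, a minimal usage sketch of the two n-syllable counters listed above. The document list is hypothetical, and the import path simply follows the module changed in the next file; this is an illustration of the documented interface, not part of the commit.

```python
# Minimal usage sketch; the document list is made up for illustration.
from linguaf.descriptive_statistics import (
    number_of_n_syllable_words,
    number_of_n_syllable_words_all,
)

docs = ["Python libraries can compute descriptive statistics of texts."]

# Words whose syllable count falls in the half-open range n = (x, y), here exactly one syllable.
one_syllable = number_of_n_syllable_words(docs, lang='en', n=(1, 2))

# New in this commit: a dict mapping each syllable count found to the number of such words.
per_count = number_of_n_syllable_words_all(docs, lang='en')

print(one_syllable, per_count)
```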
31 changes: 21 additions & 10 deletions linguaf/descriptive_statistics.py
@@ -15,7 +15,6 @@
 from linguaf import SUPPORTED_LANGS, __load_json, __check_bool_param, __check_documents_param, __check_lang_param, \
     __check_text_param, __check_words_param
 
-
 LOGGER = logging.getLogger(__name__)
 
 try:
@@ -28,7 +27,6 @@
 PUNCTUATION = r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~—«»"""
 STOPWORDS = dict()
 
-
 for language in SUPPORTED_LANGS:
     try:
         # TODO: consider using nltk directly
@@ -171,25 +169,38 @@ def number_of_n_syllable_words(documents: list, lang: str = 'en', n: tuple = (1,
     __check_documents_param(documents)
     __check_lang_param(lang)
 
+    counts = number_of_n_syllable_words_all(documents, lang, remove_stopwords)
+    count = 0
+    for i in range(n[0], n[1]):
+        count += counts.get(i, 0)
+    return count
+
+
+def number_of_n_syllable_words_all(documents: list, lang: str = 'en', remove_stopwords: bool = False) -> dict:
+    """Count each found number of syllables in a list of words.
+    Keyword arguments:
+    documents -- the list of documents
+    lang -- language of the words
+    """
+    __check_documents_param(documents)
+    __check_lang_param(lang)
+
+    # TODO: refactor duplicate code!
     unsupported_langs = ['zh', 'hy']
     if lang in unsupported_langs:
         raise ValueError(f"Syllable counting is currently not supported for the language " + lang + "!")
     # TODO: chinese does have syllables! so this should be supported eventually
-        # however, chinese does not support hyphenation, so the implementation below does not work for it!
+    # however, chinese does not support hyphenation, so the implementation below does not work for it!
 
     words = get_words(documents, lang, remove_stopwords)
-    if n[0] < 1 or n[1] <= n[0]:
-        raise ValueError(f"The given n parameter isn't correct: {n}. n=tuple(x,y), x>0, y>x.")
 
-    count = 0
+    counts = collections.defaultdict(int)
     dic = pyphen.Pyphen(lang=lang)  # TODO: match language
     for word in words:
         syl_cnt = len(dic.inserted(word).split('-'))
-        for i in range(n[0], n[1]):
-            if syl_cnt == i:
-                count += 1
-    return count
+        counts[syl_cnt] += 1
+    return counts
 
 
 def get_words(documents: list, lang: str = 'en', remove_stopwords: bool = False) -> list:
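The refactor above routes `number_of_n_syllable_words` through the new per-syllable-count tally. Below is a standalone sketch of that composition outside the library, using pyphen directly as the diff does; the word list is hypothetical and `lang='en'` mirrors the library's own default.

```python
# Standalone sketch of the counting logic in the diff above:
# tally words by syllable count with pyphen, then sum a half-open range.
# Assumes pyphen is installed; the word list stands in for get_words(...).
import collections

import pyphen

words = ["language", "syllable", "word", "count"]

dic = pyphen.Pyphen(lang='en')
counts = collections.defaultdict(int)
for word in words:
    # Number of hyphenation chunks is treated as the syllable count.
    syl_cnt = len(dic.inserted(word).split('-'))
    counts[syl_cnt] += 1

# number_of_n_syllable_words(..., n=(1, 3)) then reduces this to a sum over range [1, 3).
n = (1, 3)
total = sum(counts.get(i, 0) for i in range(n[0], n[1]))
print(dict(counts), total)
```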
