
Commit

Merge pull request #21 from ThHuberSG/syllables
adds number_of_n_syllable_words_all function
Perevalov authored Nov 5, 2024
2 parents b1bd77d + cb553c2 commit cc8f4b6
Showing 2 changed files with 22 additions and 10 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -27,6 +27,7 @@ The following descriptive statistics are supported (`descriptive_statistics.py`
 * Number of syllables `syllable_count`
 * Number of sentences `sentence_count`
 * Number of n-syllable words `number_of_n_syllable_words`
+* Number of n-syllable words for all found syllables `number_of_n_syllable_words_all`
 * Average syllables per word `avg_syllable_per_word`
 * Average word length `avg_word_length`
 * Average sentence length `avg_sentence_length`
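For reference, a minimal usage sketch of the two n-syllable counters listed above. The document list is hypothetical, and the import path simply follows the module changed in the next file; this is an illustration of the documented interface, not part of the commit.

```python
# Minimal usage sketch; the document list is made up for illustration.
from linguaf.descriptive_statistics import (
    number_of_n_syllable_words,
    number_of_n_syllable_words_all,
)

docs = ["Python libraries can compute descriptive statistics of texts."]

# Words whose syllable count falls in the half-open range n = (x, y), here exactly one syllable.
one_syllable = number_of_n_syllable_words(docs, lang='en', n=(1, 2))

# New in this commit: a dict mapping each syllable count found to the number of such words.
per_count = number_of_n_syllable_words_all(docs, lang='en')

print(one_syllable, per_count)
```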
31 changes: 21 additions & 10 deletions linguaf/descriptive_statistics.py
@@ -15,7 +15,6 @@
 from linguaf import SUPPORTED_LANGS, __load_json, __check_bool_param, __check_documents_param, __check_lang_param, \
     __check_text_param, __check_words_param
 
-
 LOGGER = logging.getLogger(__name__)
 
 try:
@@ -28,7 +27,6 @@
 PUNCTUATION = r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~—«»"""
 STOPWORDS = dict()
 
-
 for language in SUPPORTED_LANGS:
     try:
         # TODO: consider using nltk directly
@@ -171,25 +169,38 @@ def number_of_n_syllable_words(documents: list, lang: str = 'en', n: tuple = (1,
     __check_documents_param(documents)
     __check_lang_param(lang)
 
+    counts = number_of_n_syllable_words_all(documents, lang, remove_stopwords)
+    count = 0
+    for i in range(n[0], n[1]):
+        count += counts.get(i, 0)
+    return count
+
+
+def number_of_n_syllable_words_all(documents: list, lang: str = 'en', remove_stopwords: bool = False) -> dict:
+    """Count each found number of syllables in a list of words.
+    Keyword arguments:
+    documents -- the list of documents
+    lang -- language of the words
+    """
+    __check_documents_param(documents)
+    __check_lang_param(lang)
+
+    # TODO: refactor duplicate code!
     unsupported_langs = ['zh', 'hy']
     if lang in unsupported_langs:
         raise ValueError(f"Syllable counting is currently not supported for the language " + lang + "!")
     # TODO: chinese does have syllables! so this should be supported eventually
-        # however, chinese does not support hyphenation, so the implementation below does not work for it!
+    # however, chinese does not support hyphenation, so the implementation below does not work for it!
 
     words = get_words(documents, lang, remove_stopwords)
-    if n[0] < 1 or n[1] <= n[0]:
-        raise ValueError(f"The given n parameter isn't correct: {n}. n=tuple(x,y), x>0, y>x.")
 
-    count = 0
+    counts = collections.defaultdict(int)
     dic = pyphen.Pyphen(lang=lang)  # TODO: match language
     for word in words:
         syl_cnt = len(dic.inserted(word).split('-'))
-        for i in range(n[0], n[1]):
-            if syl_cnt == i:
-                count += 1
-    return count
+        counts[syl_cnt] += 1
+    return counts
 
 
 def get_words(documents: list, lang: str = 'en', remove_stopwords: bool = False) -> list:
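The refactor above routes `number_of_n_syllable_words` through the new per-syllable-count tally. Below is a standalone sketch of that composition outside the library, using pyphen directly as the diff does; the word list is hypothetical and `lang='en'` mirrors the library's own default.

```python
# Standalone sketch of the counting logic in the diff above:
# tally words by syllable count with pyphen, then sum a half-open range.
# Assumes pyphen is installed; the word list stands in for get_words(...).
import collections

import pyphen

words = ["language", "syllable", "word", "count"]

dic = pyphen.Pyphen(lang='en')
counts = collections.defaultdict(int)
for word in words:
    # Number of hyphenation chunks is treated as the syllable count.
    syl_cnt = len(dic.inserted(word).split('-'))
    counts[syl_cnt] += 1

# number_of_n_syllable_words(..., n=(1, 3)) then reduces this to a sum over range [1, 3).
n = (1, 3)
total = sum(counts.get(i, 0) for i in range(n[0], n[1]))
print(dict(counts), total)
```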
