Merge pull request #20 from novoic/dev

Dev
novoic · Jul 23, 2020 · 9fb31ea
2 parents 2315b7e + 58f6c44
commit 9fb31ea
Show file tree

Hide file tree

Showing 5 changed files with 78 additions and 98 deletions.
diff --git a/README.md b/README.md
@@ -57,7 +57,7 @@ with DocumentProcessor('stanza_config/stanza_config.yaml', 'en') as doc_proc:
     content = open('example_configs/example_document.txt').read()
     doc = doc_proc.analyze(content, 'string')
 
-res = doc.compute_features('noun_rate')
+res = doc.compute_features(['noun_rate'])
 print(res)
  ```  
 
@@ -90,7 +90,7 @@ with DocumentProcessor('stanza_config/stanza_config.yaml', 'en') as doc_proc:
     content = open('example_configs/example_document.txt').read()
     doc = doc_proc.analyze(content, 'string')
 
-res = doc.compute_features('noun_rate')
+res = doc.compute_features(['noun_rate', 'verb_rate'])
 print(res)
  ```  
 
@@ -104,7 +104,7 @@ with DocumentProcessor('stanza_config/stanza_config.yaml', 'en') as doc_proc:
     content = open('example_configs/example_document.json').read()
     doc = doc_proc.analyze(content, 'json')
 
-res = doc.compute_features('speech_rate')
+res = doc.compute_features(['speech_rate'])
 print(res)
 ```
 

diff --git a/bin/blabla b/bin/blabla
@@ -35,7 +35,7 @@ def compute_features(features_yaml_path, stanza_config_file_path, input_dir_path
 				input_format (str): The format of the input (string / json)
 
 			Returns:
-				None: 
+				None:
 	"""
 	output_file = open(output_file_path, 'w')
 	features_json = yaml.load(open(features_yaml_path, 'r'))
@@ -54,7 +54,7 @@ def compute_features(features_yaml_path, stanza_config_file_path, input_dir_path
 			with open(os.path.join(input_dir_path, input_fp)) as f:
 				content = f.read()
 			doc = doc_proc.analyze(content, input_format)
-			res_json = doc.compute_features(*features_json['features'])
+			res_json = doc.compute_features(features_json['features'])
 			res = {}
 			res = {key: [val] for key, val in res_json.items()}
 			df = df.append(pd.DataFrame.from_dict(res))
@@ -63,7 +63,7 @@ def compute_features(features_yaml_path, stanza_config_file_path, input_dir_path
 
 if __name__ == "__main__":
 	"""The main Command Line Interface to BlaBla. We provide the path to the features YAML file, path to the
-			input file and a path to the output csv file that will contain the output features. We also 
+			input file and a path to the output csv file that will contain the output features. We also
 			specify the input format which is either a string or a json.
 
 			Args:

diff --git a/blabla/document_engine.py b/blabla/document_engine.py
@@ -341,18 +341,6 @@ def auxiliary_rate(self, **kwargs):
         """
         return self._extract_lexico_semantic_features('auxiliary_rate', **kwargs)['auxiliary_rate']
 
-    def conjuction_rate(self, **kwargs):
-        """Extract the conjuction rate.
-            Ref: https://pubmed.ncbi.nlm.nih.gov/28321196/
-
-            Args:
-                kwargs (list): Optional arguments for threshold values
-
-            Returns:
-                The conjuction rate across all sentence objects
-        """
-        return self._extract_lexico_semantic_features('conjuction_rate', **kwargs)['conjuction_rate']
-
     def determiner_rate(self, **kwargs):
         """Extract the determiner rate.
             Ref: https://pubmed.ncbi.nlm.nih.gov/28321196/

diff --git a/example_configs/features.yaml b/example_configs/features.yaml
@@ -1,76 +1,69 @@
 language: "en"
 features:
-
-
-"num_pauses"
-"total_pause_time"
-"mean_pause_duration"
-"between_utterance_pause_duration"
-"pause_between_utterance_duration"
-"pause_duration"
-"hesitation_ratio"
-"pause_duration_for_hesitation"
-"speech_rate"
-"maximum_speech_rate"
-"num_rapid_sentences"
-"total_phonation_time"
-"standardized_phonation_time"
-"total_locution_time"
-"noun_rate"
-"verb_rate"
-"demonstrative_rate"
-"adjective_rate"
-"adposition_rate"
-"adverb_rate"
-"auxiliary_rate"
-"conjunction_rate"
-"determiner_rate"
-"injection_rate"
-"numeral_rate"
-"particle_rate"
-"pronoun_rate"
-"proper_noun_rate"
-"punctutation_rate"
-"subordinating_conjunction_rate"
-"symbol_rate"
-"possessive_rate"
-"noun_verb_ratio"
-"noun_ratio"
-"pronoun_noun_ratio"
-"closed_class_word_rate"
-"open_class_word_rate"
-"total_dependency_distance"
-"average_dependency_distance"
-"total_dependency_distance"
-"total_dependencies"
-"average_dependencies"
-"content_density"
-"idea_density"
-"honore_statistic"
-"brunet_index"
-"type_token_ratio"
-"word_length"
-"prop_inflected_verbs"
-"prop_auxiliary_verbs"
-"prop_gerund_verbs"
-"prop_participles"
-"num_clauses"
-"clause_rate"
-"num_dependent_clauses"
-"dependent_clause_rate"
-"prop_nouns_with_det"
-"prop_nouns_with_adj"
-"num_noun_phrases"
-"noun_phrase_rate"
-"num_verb_phrases"
-"verb_phrase_rate"
-"num_infinitive_phrases"
-"infinitive_phrase_rate"
-"num_prepositional_phrases"
-"prepositional_phrase_rate"
-"max_yngve_depth"
-"mean_yngve_depth"
-"total_yngve_depth"
-"parse_tree_height"
-"num_discourse_markers"
-"discourse_marker_rate"
+  - "num_pauses"
+  - "total_pause_time"
+  - "mean_pause_duration"
+  - "between_utterance_pause_duration"
+  - "hesitation_ratio"
+  - "speech_rate"
+  - "maximum_speech_rate"
+  - "total_phonation_time"
+  - "std_phonation_time"
+  - "total_locution_time"
+  - "adjective_rate"
+  - "adposition_rate"
+  - "adverb_rate"
+  - "auxiliary_rate"
+  - "determiner_rate"
+  - "interjection_rate"
+  - "noun_rate"
+  - "numeral_rate"
+  - "particle_rate"
+  - "pronoun_rate"
+  - "proper_noun_rate"
+  - "punctuation_rate"
+  - "subordinating_conjunction_rate"
+  - "symbol_rate"
+  - "verb_rate"
+  - "demonstrative_rate"
+  - "conjunction_rate"
+  - "possessive_rate"
+  - "noun_verb_ratio"
+  - "noun_ratio"
+  - "pronoun_noun_ratio"
+  - "total_dependency_distance"
+  - "average_dependency_distance"
+  - "total_dependencies"
+  - "average_dependencies"
+  - "closed_class_word_rate"
+  - "open_class_word_rate"
+  - "content_density"
+  - "idea_density"
+  - "honore_statistic"
+  - "brunet_index"
+  - "type_token_ratio"
+  - "word_length"
+  - "prop_inflected_verbs"
+  - "prop_auxiliary_verbs"
+  - "prop_gerund_verbs"
+  - "prop_participles"
+  - "num_noun_phrases"
+  - "noun_phrase_rate"
+  - "num_verb_phrases"
+  - "verb_phrase_rate"
+  - "num_prepositional_phrases"
+  - "prepositional_phrase_rate"
+  - "num_clauses"
+  - "clause_rate"
+  - "num_infinitive_phrases"
+  - "infinitive_phrase_rate"
+  - "num_dependent_clauses"
+  - "dependent_clause_rate"
+  - "prop_nouns_with_det"
+  - "prop_nouns_with_adj"
+  - "max_yngve_depth"
+  - "mean_yngve_depth"
+  - "total_yngve_depth"
+  - "parse_tree_height"
+  - "num_discourse_markers"
+  - "discourse_marker_rate"
diff --git a/setup.py b/setup.py
@@ -2,7 +2,7 @@
 
 setup(
     name="blabla",
-    version="0.1",
+    version="0.2.1",
     description="Novoic linguistics feature extraction package.",
     url="http://github.com/novoic/BlaBla",
     author="Abhishek Shivkumar",
@@ -28,12 +28,11 @@
         "alzheimers-disease",
         "parkinsons-disease",
     ],
-    download_url="https://github.com/novoic/blabla/archive/v0.1.tar.gz",
+    download_url="https://github.com/novoic/blabla/archive/v0.2.1.tar.gz",
     install_requires=[
         "stanza==1.0.0",
         "flask==1.1.2",
         "jsonpickle==1.4",
-        "pyyaml==5.3.1",
         "anytree==2.8.0",
         "nltk==3.5",
         "ipython==7.13.0",
@@ -47,10 +46,10 @@
     scripts=["bin/blabla"],
     zip_safe=False,
     classifiers=[
-    'Development Status :: 4 - Beta',     
-    'Intended Audience :: Healthcare Industry',     
+    'Development Status :: 4 - Beta',
+    'Intended Audience :: Healthcare Industry',
     'Topic :: Scientific/Engineering :: Bio-Informatics',
-    'License :: OSI Approved :: GNU General Public License v3 (GPLv3)', 
+    'License :: OSI Approved :: GNU General Public License v3 (GPLv3)',
     'Programming Language :: Python :: 3.6',
     'Programming Language :: Python :: 3.7',
   ],