diff --git a/README.md b/README.md index f9586b2..755b865 100644 --- a/README.md +++ b/README.md @@ -57,7 +57,7 @@ with DocumentProcessor('stanza_config/stanza_config.yaml', 'en') as doc_proc: content = open('example_configs/example_document.txt').read() doc = doc_proc.analyze(content, 'string') -res = doc.compute_features('noun_rate') +res = doc.compute_features(['noun_rate']) print(res) ``` @@ -90,7 +90,7 @@ with DocumentProcessor('stanza_config/stanza_config.yaml', 'en') as doc_proc: content = open('example_configs/example_document.txt').read() doc = doc_proc.analyze(content, 'string') -res = doc.compute_features('noun_rate') +res = doc.compute_features(['noun_rate', 'verb_rate']) print(res) ``` @@ -104,7 +104,7 @@ with DocumentProcessor('stanza_config/stanza_config.yaml', 'en') as doc_proc: content = open('example_configs/example_document.json').read() doc = doc_proc.analyze(content, 'json') -res = doc.compute_features('speech_rate') +res = doc.compute_features(['speech_rate']) print(res) ``` diff --git a/bin/blabla b/bin/blabla index f7f5744..d128d19 100644 --- a/bin/blabla +++ b/bin/blabla @@ -35,7 +35,7 @@ def compute_features(features_yaml_path, stanza_config_file_path, input_dir_path input_format (str): The format of the input (string / json) Returns: - None: + None: """ output_file = open(output_file_path, 'w') features_json = yaml.load(open(features_yaml_path, 'r')) @@ -54,7 +54,7 @@ def compute_features(features_yaml_path, stanza_config_file_path, input_dir_path with open(os.path.join(input_dir_path, input_fp)) as f: content = f.read() doc = doc_proc.analyze(content, input_format) - res_json = doc.compute_features(*features_json['features']) + res_json = doc.compute_features(features_json['features']) res = {} res = {key: [val] for key, val in res_json.items()} df = df.append(pd.DataFrame.from_dict(res)) @@ -63,7 +63,7 @@ def compute_features(features_yaml_path, stanza_config_file_path, input_dir_path if __name__ == "__main__": """The main Command Line Interface to BlaBla. We provide the path to the features YAML file, path to the - input file and a path to the output csv file that will contain the output features. We also + input file and a path to the output csv file that will contain the output features. We also specify the input format which is either a string or a json. Args: diff --git a/blabla/document_engine.py b/blabla/document_engine.py index af89d70..14bd64f 100644 --- a/blabla/document_engine.py +++ b/blabla/document_engine.py @@ -341,18 +341,6 @@ def auxiliary_rate(self, **kwargs): """ return self._extract_lexico_semantic_features('auxiliary_rate', **kwargs)['auxiliary_rate'] - def conjuction_rate(self, **kwargs): - """Extract the conjuction rate. - Ref: https://pubmed.ncbi.nlm.nih.gov/28321196/ - - Args: - kwargs (list): Optional arguments for threshold values - - Returns: - The conjuction rate across all sentence objects - """ - return self._extract_lexico_semantic_features('conjuction_rate', **kwargs)['conjuction_rate'] - def determiner_rate(self, **kwargs): """Extract the determiner rate. Ref: https://pubmed.ncbi.nlm.nih.gov/28321196/ diff --git a/example_configs/features.yaml b/example_configs/features.yaml index a99a3dc..d40088c 100644 --- a/example_configs/features.yaml +++ b/example_configs/features.yaml @@ -1,76 +1,69 @@ language: "en" features: - - -"num_pauses" -"total_pause_time" -"mean_pause_duration" -"between_utterance_pause_duration" -"pause_between_utterance_duration" -"pause_duration" -"hesitation_ratio" -"pause_duration_for_hesitation" -"speech_rate" -"maximum_speech_rate" -"num_rapid_sentences" -"total_phonation_time" -"standardized_phonation_time" -"total_locution_time" -"noun_rate" -"verb_rate" -"demonstrative_rate" -"adjective_rate" -"adposition_rate" -"adverb_rate" -"auxiliary_rate" -"conjunction_rate" -"determiner_rate" -"injection_rate" -"numeral_rate" -"particle_rate" -"pronoun_rate" -"proper_noun_rate" -"punctutation_rate" -"subordinating_conjunction_rate" -"symbol_rate" -"possessive_rate" -"noun_verb_ratio" -"noun_ratio" -"pronoun_noun_ratio" -"closed_class_word_rate" -"open_class_word_rate" -"total_dependency_distance" -"average_dependency_distance" -"total_dependency_distance" -"total_dependencies" -"average_dependencies" -"content_density" -"idea_density" -"honore_statistic" -"brunet_index" -"type_token_ratio" -"word_length" -"prop_inflected_verbs" -"prop_auxiliary_verbs" -"prop_gerund_verbs" -"prop_participles" -"num_clauses" -"clause_rate" -"num_dependent_clauses" -"dependent_clause_rate" -"prop_nouns_with_det" -"prop_nouns_with_adj" -"num_noun_phrases" -"noun_phrase_rate" -"num_verb_phrases" -"verb_phrase_rate" -"num_infinitive_phrases" -"infinitive_phrase_rate" -"num_prepositional_phrases" -"prepositional_phrase_rate" -"max_yngve_depth" -"mean_yngve_depth" -"total_yngve_depth" -"parse_tree_height" -"num_discourse_markers" -"discourse_marker_rate" + - "num_pauses" + - "total_pause_time" + - "mean_pause_duration" + - "between_utterance_pause_duration" + - "hesitation_ratio" + - "speech_rate" + - "maximum_speech_rate" + - "total_phonation_time" + - "std_phonation_time" + - "total_locution_time" + - "adjective_rate" + - "adposition_rate" + - "adverb_rate" + - "auxiliary_rate" + - "determiner_rate" + - "interjection_rate" + - "noun_rate" + - "numeral_rate" + - "particle_rate" + - "pronoun_rate" + - "proper_noun_rate" + - "punctuation_rate" + - "subordinating_conjunction_rate" + - "symbol_rate" + - "verb_rate" + - "demonstrative_rate" + - "conjunction_rate" + - "possessive_rate" + - "noun_verb_ratio" + - "noun_ratio" + - "pronoun_noun_ratio" + - "total_dependency_distance" + - "average_dependency_distance" + - "total_dependencies" + - "average_dependencies" + - "closed_class_word_rate" + - "open_class_word_rate" + - "content_density" + - "idea_density" + - "honore_statistic" + - "brunet_index" + - "type_token_ratio" + - "word_length" + - "prop_inflected_verbs" + - "prop_auxiliary_verbs" + - "prop_gerund_verbs" + - "prop_participles" + - "num_noun_phrases" + - "noun_phrase_rate" + - "num_verb_phrases" + - "verb_phrase_rate" + - "num_prepositional_phrases" + - "prepositional_phrase_rate" + - "num_clauses" + - "clause_rate" + - "num_infinitive_phrases" + - "infinitive_phrase_rate" + - "num_dependent_clauses" + - "dependent_clause_rate" + - "prop_nouns_with_det" + - "prop_nouns_with_adj" + - "max_yngve_depth" + - "mean_yngve_depth" + - "total_yngve_depth" + - "parse_tree_height" + - "num_discourse_markers" + - "discourse_marker_rate" diff --git a/setup.py b/setup.py index 7fc2080..ca3696b 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ setup( name="blabla", - version="0.1", + version="0.2.1", description="Novoic linguistics feature extraction package.", url="http://github.com/novoic/BlaBla", author="Abhishek Shivkumar", @@ -28,12 +28,11 @@ "alzheimers-disease", "parkinsons-disease", ], - download_url="https://github.com/novoic/blabla/archive/v0.1.tar.gz", + download_url="https://github.com/novoic/blabla/archive/v0.2.1.tar.gz", install_requires=[ "stanza==1.0.0", "flask==1.1.2", "jsonpickle==1.4", - "pyyaml==5.3.1", "anytree==2.8.0", "nltk==3.5", "ipython==7.13.0", @@ -47,10 +46,10 @@ scripts=["bin/blabla"], zip_safe=False, classifiers=[ - 'Development Status :: 4 - Beta', - 'Intended Audience :: Healthcare Industry', + 'Development Status :: 4 - Beta', + 'Intended Audience :: Healthcare Industry', 'Topic :: Scientific/Engineering :: Bio-Informatics', - 'License :: OSI Approved :: GNU General Public License v3 (GPLv3)', + 'License :: OSI Approved :: GNU General Public License v3 (GPLv3)', 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', ],