Skip to content

Commit

Permalink
Merge pull request #20 from novoic/dev
Browse files (browse the repository at this point in the history)
Dev
  • Loading branch information
abhisheknovoic authored Jul 23, 2020
2 parents 2315b7e + 58f6c44 commit 9fb31ea
Show file tree
Hide file tree
Showing 5 changed files with 78 additions and 98 deletions.
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ with DocumentProcessor('stanza_config/stanza_config.yaml', 'en') as doc_proc:
content = open('example_configs/example_document.txt').read()
doc = doc_proc.analyze(content, 'string')

res = doc.compute_features('noun_rate')
res = doc.compute_features(['noun_rate'])
print(res)
```

Expand Down Expand Up @@ -90,7 +90,7 @@ with DocumentProcessor('stanza_config/stanza_config.yaml', 'en') as doc_proc:
content = open('example_configs/example_document.txt').read()
doc = doc_proc.analyze(content, 'string')

res = doc.compute_features('noun_rate')
res = doc.compute_features(['noun_rate', 'verb_rate'])
print(res)
```

Expand All @@ -104,7 +104,7 @@ with DocumentProcessor('stanza_config/stanza_config.yaml', 'en') as doc_proc:
content = open('example_configs/example_document.json').read()
doc = doc_proc.analyze(content, 'json')

res = doc.compute_features('speech_rate')
res = doc.compute_features(['speech_rate'])
print(res)
```

Expand Down
6 changes: 3 additions & 3 deletions bin/blabla
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def compute_features(features_yaml_path, stanza_config_file_path, input_dir_path
input_format (str): The format of the input (string / json)
Returns:
None:
None:
"""
output_file = open(output_file_path, 'w')
features_json = yaml.load(open(features_yaml_path, 'r'))
Expand All @@ -54,7 +54,7 @@ def compute_features(features_yaml_path, stanza_config_file_path, input_dir_path
with open(os.path.join(input_dir_path, input_fp)) as f:
content = f.read()
doc = doc_proc.analyze(content, input_format)
res_json = doc.compute_features(*features_json['features'])
res_json = doc.compute_features(features_json['features'])
res = {}
res = {key: [val] for key, val in res_json.items()}
df = df.append(pd.DataFrame.from_dict(res))
Expand All @@ -63,7 +63,7 @@ def compute_features(features_yaml_path, stanza_config_file_path, input_dir_path

if __name__ == "__main__":
"""The main Command Line Interface to BlaBla. We provide the path to the features YAML file, path to the
input file and a path to the output csv file that will contain the output features. We also
input file and a path to the output csv file that will contain the output features. We also
specify the input format which is either a string or a json.
Args:
Expand Down
12 changes: 0 additions & 12 deletions blabla/document_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -341,18 +341,6 @@ def auxiliary_rate(self, **kwargs):
"""
return self._extract_lexico_semantic_features('auxiliary_rate', **kwargs)['auxiliary_rate']

def conjuction_rate(self, **kwargs):
"""Extract the conjuction rate.
Ref: https://pubmed.ncbi.nlm.nih.gov/28321196/
Args:
kwargs (list): Optional arguments for threshold values
Returns:
The conjuction rate across all sentence objects
"""
return self._extract_lexico_semantic_features('conjuction_rate', **kwargs)['conjuction_rate']

def determiner_rate(self, **kwargs):
"""Extract the determiner rate.
Ref: https://pubmed.ncbi.nlm.nih.gov/28321196/
Expand Down
141 changes: 67 additions & 74 deletions example_configs/features.yaml
Original file line number Diff line number Diff line change
@@ -1,76 +1,69 @@
language: "en"
features:


"num_pauses"
"total_pause_time"
"mean_pause_duration"
"between_utterance_pause_duration"
"pause_between_utterance_duration"
"pause_duration"
"hesitation_ratio"
"pause_duration_for_hesitation"
"speech_rate"
"maximum_speech_rate"
"num_rapid_sentences"
"total_phonation_time"
"standardized_phonation_time"
"total_locution_time"
"noun_rate"
"verb_rate"
"demonstrative_rate"
"adjective_rate"
"adposition_rate"
"adverb_rate"
"auxiliary_rate"
"conjunction_rate"
"determiner_rate"
"injection_rate"
"numeral_rate"
"particle_rate"
"pronoun_rate"
"proper_noun_rate"
"punctutation_rate"
"subordinating_conjunction_rate"
"symbol_rate"
"possessive_rate"
"noun_verb_ratio"
"noun_ratio"
"pronoun_noun_ratio"
"closed_class_word_rate"
"open_class_word_rate"
"total_dependency_distance"
"average_dependency_distance"
"total_dependency_distance"
"total_dependencies"
"average_dependencies"
"content_density"
"idea_density"
"honore_statistic"
"brunet_index"
"type_token_ratio"
"word_length"
"prop_inflected_verbs"
"prop_auxiliary_verbs"
"prop_gerund_verbs"
"prop_participles"
"num_clauses"
"clause_rate"
"num_dependent_clauses"
"dependent_clause_rate"
"prop_nouns_with_det"
"prop_nouns_with_adj"
"num_noun_phrases"
"noun_phrase_rate"
"num_verb_phrases"
"verb_phrase_rate"
"num_infinitive_phrases"
"infinitive_phrase_rate"
"num_prepositional_phrases"
"prepositional_phrase_rate"
"max_yngve_depth"
"mean_yngve_depth"
"total_yngve_depth"
"parse_tree_height"
"num_discourse_markers"
"discourse_marker_rate"
- "num_pauses"
- "total_pause_time"
- "mean_pause_duration"
- "between_utterance_pause_duration"
- "hesitation_ratio"
- "speech_rate"
- "maximum_speech_rate"
- "total_phonation_time"
- "std_phonation_time"
- "total_locution_time"
- "adjective_rate"
- "adposition_rate"
- "adverb_rate"
- "auxiliary_rate"
- "determiner_rate"
- "interjection_rate"
- "noun_rate"
- "numeral_rate"
- "particle_rate"
- "pronoun_rate"
- "proper_noun_rate"
- "punctuation_rate"
- "subordinating_conjunction_rate"
- "symbol_rate"
- "verb_rate"
- "demonstrative_rate"
- "conjunction_rate"
- "possessive_rate"
- "noun_verb_ratio"
- "noun_ratio"
- "pronoun_noun_ratio"
- "total_dependency_distance"
- "average_dependency_distance"
- "total_dependencies"
- "average_dependencies"
- "closed_class_word_rate"
- "open_class_word_rate"
- "content_density"
- "idea_density"
- "honore_statistic"
- "brunet_index"
- "type_token_ratio"
- "word_length"
- "prop_inflected_verbs"
- "prop_auxiliary_verbs"
- "prop_gerund_verbs"
- "prop_participles"
- "num_noun_phrases"
- "noun_phrase_rate"
- "num_verb_phrases"
- "verb_phrase_rate"
- "num_prepositional_phrases"
- "prepositional_phrase_rate"
- "num_clauses"
- "clause_rate"
- "num_infinitive_phrases"
- "infinitive_phrase_rate"
- "num_dependent_clauses"
- "dependent_clause_rate"
- "prop_nouns_with_det"
- "prop_nouns_with_adj"
- "max_yngve_depth"
- "mean_yngve_depth"
- "total_yngve_depth"
- "parse_tree_height"
- "num_discourse_markers"
- "discourse_marker_rate"
11 changes: 5 additions & 6 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

setup(
name="blabla",
version="0.1",
version="0.2.1",
description="Novoic linguistics feature extraction package.",
url="http://github.com/novoic/BlaBla",
author="Abhishek Shivkumar",
Expand All @@ -28,12 +28,11 @@
"alzheimers-disease",
"parkinsons-disease",
],
download_url="https://github.com/novoic/blabla/archive/v0.1.tar.gz",
download_url="https://github.com/novoic/blabla/archive/v0.2.1.tar.gz",
install_requires=[
"stanza==1.0.0",
"flask==1.1.2",
"jsonpickle==1.4",
"pyyaml==5.3.1",
"anytree==2.8.0",
"nltk==3.5",
"ipython==7.13.0",
Expand All @@ -47,10 +46,10 @@
scripts=["bin/blabla"],
zip_safe=False,
classifiers=[
'Development Status :: 4 - Beta',
'Intended Audience :: Healthcare Industry',
'Development Status :: 4 - Beta',
'Intended Audience :: Healthcare Industry',
'Topic :: Scientific/Engineering :: Bio-Informatics',
'License :: OSI Approved :: GNU General Public License v3 (GPLv3)',
'License :: OSI Approved :: GNU General Public License v3 (GPLv3)',
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7',
],
Expand Down

0 comments on commit 9fb31ea

Please sign in to comment.