Merge pull request #25 from dhmit/transfer_frequency_module_PR1

Transfer frequency module pr1
dhmit · Jul 23, 2021 · 0fdfcc6 · 0fdfcc6
2 parents 1276da0 + b10cfa9
commit 0fdfcc6
Show file tree

Hide file tree

Showing 2 changed files with 160 additions and 1 deletion.
diff --git a/backend/app/analysis/frequency.py b/backend/app/analysis/frequency.py
@@ -0,0 +1,87 @@
+from collections import Counter
+from ..models import (
+    Corpus,
+    Document,
+    Gender
+)
+
+
+def _get_gender_word_frequencies_relative(gender_word_counts):
+    """
+    A private helper that converts identifier counts keyed to Gender instances into shares
+    of the grand total, i.e. the sum of every identifier count across all Gender instances.
+    Each share is the plain ratio count / total (a fraction, not multiplied by 100).
+
+    :param gender_word_counts: a dictionary keying Gender instances to dictionaries of string
+                               identifiers keyed to integer counts.
+    :return: a new dictionary of the same shape in which each integer count is replaced by
+             its share of the grand total; every share is 0 when the grand total is zero.
+             The input dictionary is left unmodified.
+    """
+
+    output = {}
+    total_word_count = 0  # grand total over every gender and identifier
+    for gender in gender_word_counts:
+        for word in gender_word_counts[gender]:
+            total_word_count += gender_word_counts[gender][word]
+
+    for gender in gender_word_counts:
+        output[gender] = {}
+        for word, original_count in gender_word_counts[gender].items():
+            try:
+                frequency = original_count / total_word_count
+            except ZeroDivisionError:  # grand total is zero, so report a zero share
+                frequency = 0
+            output[gender][word] = frequency
+
+    return output
+
+
+def run_single_analysis(doc_obj, genders):
+    """
+    Run the pronoun frequency analyses for a single Document against each given Gender.
+
+    The result is a dictionary with three entries, each keyed by Gender instance:
+    'count' is a Counter holding the value of doc_obj.get_count_of_words(gender.pronouns)
+    (per-pronoun occurrence counts, judging by the tests); 'frequency' is a dictionary holding
+    the value of doc_obj.get_word_freqs(gender.pronouns) (presumably each count over the
+    Document's word count - TODO confirm); and 'relative' is a dictionary holding each pronoun
+    count as a fraction of the total pronoun count across all given Genders in this Document.
+
+    :param doc_obj: an instance of the Document model
+    :param genders: an iterable of Gender objects (run_analysis passes a QuerySet)
+    :return: a dictionary of the shape {'count': ..., 'frequency': ..., 'relative': ...}
+    """
+    count = Counter()  # only used as a mapping here: Gender -> per-pronoun counts
+    frequency = {}
+
+    for gender in genders:
+        count[gender] = doc_obj.get_count_of_words(gender.pronouns)
+        frequency[gender] = doc_obj.get_word_freqs(gender.pronouns)
+    relative = _get_gender_word_frequencies_relative(count)  # shares of the pronoun total
+
+    output = {
+        'count': count,
+        'frequency': frequency,
+        'relative': relative
+    }
+
+    return output
+
+
+def run_analysis(corpus_id, gender_ids):
+    """
+    Run the frequency analyses for every Document that belongs to the given Corpus.
+    Each Document's entry maps the type of analysis ('count', 'frequency', 'relative') to the
+    analysis itself, as produced by run_single_analysis.
+
+    :param corpus_id: the primary key of a Corpus instance
+    :param gender_ids: a list of integers representing Gender primary keys
+    :return: a dictionary mapping Document primary keys to that Document's frequency analyses
+    """
+    results = {}
+    genders = Gender.objects.filter(id__in=gender_ids)
+    doc_ids = Corpus.objects.filter(pk=corpus_id).values_list('documents__pk', flat=True)
+    for pk in doc_ids:  # NOTE(review): pk may be None for a Corpus without documents - confirm
+        results[pk] = run_single_analysis(Document.objects.get(id=pk), genders)
+    return results
diff --git a/backend/app/tests.py b/backend/app/tests.py
@@ -12,7 +12,10 @@
     Corpus,
     Gender,
 )
-from .analysis import proximity
+from .analysis import (
+    proximity,
+    frequency
+)
 
 
 class PronounSeriesTestCase(TestCase):
@@ -197,6 +200,75 @@ def test_update_metadata(self):
         self.assertEqual(doc.word_count, 9)
 
 
+class FrequencyTestCase(TestCase):
+    """
+    Test cases for the frequency analysis (run_single_analysis and run_analysis)
+    """
+    def setUp(self):  # builds one 40-word Document and adds it to one Corpus
+        text1 = """She took a lighter out of her purse and handed it over to him.
+            He lit his cigarette and took a deep drag from it, and then began
+            his speech which ended in a proposal. Her tears drowned the ring."""
+        Document.objects.create_document(title='doc1', year=2021, text=text1)
+        Corpus.objects.create(title='corpus1')
+        Corpus.objects.get(title='corpus1').documents.add(Document.objects.get(title='doc1'))
+
+    def test_single_frequency(self):
+        doc1 = Document.objects.get(title='doc1')
+        male = Gender.objects.get(pk=1, label='Male')  # NOTE(review): Genders not created in setUp
+        female = Gender.objects.get(pk=2, label='Female')
+        nonbinary = Gender.objects.get(pk=3, label='Nonbinary')
+        result = frequency.run_single_analysis(doc1, [male, female, nonbinary])
+        expected = {
+            'count': Counter({  # NOTE(review): the Counter import is outside this hunk - confirm
+                male: Counter({'his': 2, 'him': 1, 'he': 1, 'himself': 0}),
+                female: Counter({'her': 2, 'she': 1, 'herself': 0, 'hers': 0}),
+                nonbinary: Counter({'theirs': 0, 'themself': 0, 'them': 0, 'their': 0, 'they': 0})}),
+            'frequency': {  # each count / 40, the number of words in text1
+                male: {'his': 0.05, 'him': 0.025, 'he': 0.025, 'himself': 0.0},
+                female: {'herself': 0.0, 'she': 0.025, 'her': 0.05, 'hers': 0.0},
+                nonbinary: {'theirs': 0.0, 'themself': 0.0, 'them': 0.0, 'their': 0.0, 'they': 0.0}},
+            'relative': {  # each count / 7, the pronoun total across all three genders
+                male: {
+                    'his': 0.2857142857142857,
+                    'him': 0.14285714285714285,
+                    'he': 0.14285714285714285,
+                    'himself': 0.0},
+                female: {
+                    'herself': 0.0,
+                    'she': 0.14285714285714285,
+                    'her': 0.2857142857142857, 'hers': 0.0},
+                nonbinary: {'theirs': 0.0, 'themself': 0.0, 'them': 0.0, 'their': 0.0, 'they': 0.0}}}
+        self.assertEqual(result, expected)
+
+
+    def test_run_analysis(self):
+        result = frequency.run_analysis(1, [1, 2, 3])  # Corpus pk 1; Gender pks 1, 2 and 3
+        male = Gender.objects.get(pk=1, label='Male')
+        female = Gender.objects.get(pk=2, label='Female')
+        nonbinary = Gender.objects.get(pk=3, label='Nonbinary')
+        expected = {
+            1: {'count': Counter({  # the outer key is the Document primary key
+                    male: Counter({'his': 2, 'him': 1, 'he': 1, 'himself': 0}),
+                    female: Counter({'her': 2, 'she': 1, 'herself': 0, 'hers': 0}),
+                    nonbinary: Counter({'theirs': 0, 'themself': 0, 'them': 0, 'their': 0, 'they': 0})}),
+                'frequency': {  # each count / 40, the number of words in text1
+                    male: {'his': 0.05, 'him': 0.025, 'he': 0.025, 'himself': 0.0},
+                    female: {'herself': 0.0, 'she': 0.025, 'her': 0.05, 'hers': 0.0},
+                    nonbinary: {'theirs': 0.0, 'themself': 0.0, 'them': 0.0, 'their': 0.0, 'they': 0.0}},
+                'relative': {  # each count / 7, the pronoun total across all three genders
+                    male: {
+                        'his': 0.2857142857142857,
+                        'him': 0.14285714285714285,
+                        'he': 0.14285714285714285,
+                        'himself': 0.0},
+                    female: {
+                        'herself': 0.0,
+                        'she': 0.14285714285714285,
+                        'her': 0.2857142857142857, 'hers': 0.0},
+                    nonbinary: {'theirs': 0.0, 'themself': 0.0, 'them': 0.0, 'their': 0.0, 'they': 0.0}}}}
+        self.assertEqual(result, expected)
+
+
 class CorpusTestCase(TestCase):
     """
     Test Cases for the Corpus Model