dhmit · Nisha-Nathan · Oct 15, 2021 · Oct 22, 2021 · Oct 22, 2021 · Oct 29, 2021
diff --git a/.idea/misc.xml b/.idea/misc.xml
diff --git a/.idea/temp.iml b/.idea/temp.iml
diff --git a/backend/app/POST.py b/backend/app/POST.py
@@ -0,0 +1,18 @@
+
+
+import csv
+from app.services import parse_csv
+
+
+@api_view(['POST'])  # NOTE(review): api_view is not imported anywhere in this file
+def upload_doc(request):
+    """
+    API endpoint for uploading csv files: wraps the request data in a csv.DictReader and passes it to parse_csv
+    """
+    file=request.data
+    # TODO: check that the input really is csv. NOTE(review): DictReader iterates `file` itself - confirm request.data yields text lines
+    csv_reader = csv.DictReader(file)
+    x= parse_csv(csv_reader)# NOTE(review): services/parse_csv.py declares parse_csv(csv_reader, title) and returns a Corpus; no title is passed, and `from app.services import parse_csv` may bind the module - confirm
+    return x  # NOTE(review): handed back as-is, not wrapped in a Response
+
+
diff --git a/backend/app/analysis/auto_instances.py b/backend/app/analysis/auto_instances.py
@@ -0,0 +1,46 @@
+from app.models import Document, Corpus
+import os
+import csv
+from app.services.parse_csv import parse_csv
+
+def create_instances():
+    """Build unsaved Document instances, one per line of every file in the small_talks folder beside this module."""
+    # Resolved relative to this module rather than one developer's absolute Windows path.
+    directory = os.path.join(os.path.dirname(os.path.abspath(__file__)), "small_talks")
+    docList = []
+    for doc in os.listdir(directory):
+        title = doc.replace(".txt", "")
+        # Explicit utf-8 instead of the platform default encoding.
+        with open(os.path.join(directory, doc), encoding='utf-8') as newDoc:
+            for line in newDoc:
+                docList.append(Document(title=title, text=line))
+
+    return docList
+
+def create_instances_new():
+    """
+    Save a Document for every file in the small_talks folder whose title is not stored yet.
+    """
+    # Resolved relative to this module rather than one developer's absolute Windows path.
+    directory = os.path.join(os.path.dirname(os.path.abspath(__file__)), "small_talks")
+
+    for doc in os.listdir(directory):
+        doc_title = doc.replace(".txt", "")
+        with open(os.path.join(directory, doc), encoding='utf-8') as newDoc:
+            for line in newDoc:
+                # Checked per line: once the first line is stored, the file's remaining lines are skipped.
+                if not Document.objects.filter(title=doc_title).exists():
+                    new_doc = Document.objects.create_document(title=doc_title, text=line)
+                    new_doc.save()
+    # Returns nothing; the saved rows are available through Document.objects.
+
+def create_instances_newer(csv_filename):
+    """Parse the csv file named csv_filename (located beside this module) into a corpus via parse_csv."""
+    module_dir = os.path.dirname(os.path.abspath(__file__))
+    csv_path = os.path.join(module_dir, csv_filename)
+
+    with open(csv_path, encoding='utf-8') as csv_file:
+        return parse_csv(csv.DictReader(csv_file), csv_filename)
+
+def main():
+    create_instances_newer("small_talks.csv")  # parse the csv beside this module; the returned value is discarded
diff --git a/backend/app/analysis/test.py b/backend/app/analysis/test.py
@@ -0,0 +1,49 @@
+import csv
+import os
+
+current_path = os.path.dirname(os.path.abspath(__file__))  # directory that contains test.py
+test_csv = os.path.join(current_path, 'small_talks.csv')  # path of small_talks.csv inside that directory
+
+def parse_csv(path_to_files):
+    """
+    Write the 'transcript' column of every row in a csv to its own .txt file.
+
+    Each file is named after the row's 'title' (with '?' removed) and placed in a folder
+    named like the csv minus its extension; that folder is created when missing.
+
+    :param path_to_files: path of the csv file to split up
+    :return: path of the folder holding the generated .txt files
+    """
+    path_to_folder = os.path.splitext(path_to_files)[0]
+    print(f"Path: {path_to_folder}")
+    # exist_ok avoids the check-then-create race of os.path.exists() + os.mkdir()
+    os.makedirs(path_to_folder, exist_ok=True)
+    print(f"Path to files: {path_to_files}")
+
+    with open(path_to_files, encoding='utf-8') as test_data_csv:
+        for data in csv.DictReader(test_data_csv):
+            print(data)
+            # '?' is the only character stripped; it is not allowed in Windows file names
+            filename = f"{data['title']}.txt".replace("?", "")
+
+            with open(os.path.join(path_to_folder, filename), "w", encoding='utf8') as f:
+                f.write(data['transcript'])
+
+    return path_to_folder
+
+def test_function():
+    """Print the path of the sample csv, then each of its rows as parsed by csv.DictReader."""
+    print(test_csv)
+    with open(test_csv, encoding='utf-8') as sample:
+        for parsed_row in csv.DictReader(sample):
+            print(parsed_row)
+
+#def convert_ted_talks_csv(csv_path):
+#    with open(csv_path, encoding='utf-8') as csv_file:
+#        reader = csv.reader(csv_file)
+#        for line in reader:
+#            print(line)
+
+
+if __name__ == '__main__':  # run as a script: split small_talks.csv into one .txt file per row
+    parse_csv(test_csv)
diff --git a/backend/app/services/parse_csv.py b/backend/app/services/parse_csv.py
@@ -0,0 +1,37 @@
+from app.models import Document, Corpus
+
+def parse_csv(csv_reader, title):
+    """
+    Turn the rows of a csv into Document instances and collect them in a Corpus.
+
+    :param csv_reader: iterable of row mappings (e.g. a csv.DictReader); every row must provide
+        the keys 'title', 'transcript', 'speaker_1' and 'recorded_date'
+    :param title: name the corpus is derived from; the corpus is titled f"{title} Corpus"
+    :return: the Corpus (created if missing, otherwise reused) holding one Document per row
+    """
+    doc_list = []
+    for data in csv_reader:
+        # Documents are matched by title: reuse a stored one rather than inserting a duplicate.
+        new_doc = Document.objects.filter(title=data['title']).first()
+        if new_doc is None:
+            new_doc = Document.objects.create_document(
+                title=data['title'],
+                text=data['transcript'],
+                author=data['speaker_1'],
+                year=data['recorded_date'][:4],  # first four characters of the recorded date
+            )
+            new_doc.save()
+        doc_list.append(new_doc)
+
+    # TODO: use the id as a unique identifier instead of the title (or maybe remove this check entirely)
+    corpus_title = f"{title} Corpus"
+    new_corpus = Corpus.objects.filter(title=corpus_title).first()
+    if new_corpus is None:
+        new_corpus = Corpus(title=corpus_title)
+        new_corpus.save()  # saved first so it has a primary key before documents are attached
+
+    # set() replaces whatever documents the corpus held before with exactly these rows.
+    new_corpus.documents.set(doc_list)
+    new_corpus.save()
+
+    return new_corpus
diff --git a/backend/app/views.py b/backend/app/views.py
@@ -21,6 +21,8 @@
 }
 """
 import json
+import csv
+
 
 from rest_framework.decorators import api_view
 from rest_framework.response import Response
@@ -36,7 +38,8 @@
     GenderSerializer,
     CorpusSerializer
 )
-
+from app.analysis.auto_instances import create_instances_newer
+from app.services.parse_csv import parse_csv
 
 @api_view(['GET'])
 def get_example(request, example_id):
@@ -102,6 +105,7 @@ def add_document(request):
     """
     API endpoint for adding a piece of document
     """
+    print("we are in add_document")
     attributes = request.data
     new_attributes = {}
     for attribute in attributes['newAttributes']:
@@ -129,6 +133,19 @@ def all_documents(request):
     serializer = SimpleDocumentSerializer(doc_objs, many=True)
     return Response(serializer.data)
 
+@api_view(['POST'])
+def upload_document(request):
+    """
+    API endpoint for uploading a csv file and converting its rows into a corpus.
+
+    Expects the uploaded file under the "filename" key of the request data and
+    responds with the serialized corpus, which is titled after the uploaded file's name.
+    """
+    file = request.data["filename"]
+    content = file.read().decode('utf-8').splitlines()  # DictReader wants text lines, not bytes
+    # parse_csv takes the corpus title as a required second argument; use the uploaded file's name
+    new_corpus = parse_csv(csv.DictReader(content), file.name)
+    return Response(CorpusSerializer(new_corpus).data)
 
 @api_view(['GET'])
 def get_document(request, doc_id):
@@ -211,6 +228,16 @@ def update_corpus_docs(request):
     serializer = CorpusSerializer(corpus_obj)
     return Response(serializer.data)
 
+@api_view(['POST'])
+def create_corpus_csv(request):
+    """
+    API endpoint for converting a csv file (stored beside app/analysis/auto_instances.py) into a corpus
+    """
+    filename = request.data["csv"]
+    if filename != filename.replace("\\", "/").rsplit("/", 1)[-1]:  # client-supplied: reject any path component
+        return Response({"error": "csv must be a bare file name"}, status=400)
+    serializer = CorpusSerializer(create_instances_newer(filename))
+    return Response(serializer.data)
 
 @api_view(['DELETE'])
 def delete_corpus(request):

diff --git a/backend/config/urls.py b/backend/config/urls.py
@@ -35,6 +35,8 @@
     path('api/update_corpus_docs', views.update_corpus_docs),
     path('api/delete_corpus', views.delete_corpus),
     path('api/corpus/<int:corpus_id>', views.get_corpus),
+    path('api/corpus_from_csv/', views.create_corpus_csv),
+    path('api/upload_document', views.upload_document),
 
     # View paths
     path('', views.index, name='index'),
@@ -43,5 +45,6 @@
     path('documents', views.documents, name='documents'),
     path('document/<int:doc_id>', views.single_document, name='document'),
     path('corpora', views.corpora, name='corpora'),
-    path('corpus/<int:corpus_id>', views.corpus, name='corpus')
+    path('corpus/<int:corpus_id>', views.corpus, name='corpus'),
+
 ]