Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Import csv #63

Open
wants to merge 17 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 16 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .idea/misc.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion .idea/temp.iml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

18 changes: 18 additions & 0 deletions backend/app/POST.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@


import csv

from rest_framework.decorators import api_view
from rest_framework.response import Response

from app.services import parse_csv


@api_view(['POST'])
def upload_doc(request):
    """
    API endpoint for uploading a csv file and turning its rows into a corpus.

    The uploaded file is read from the "filename" key of the request data,
    decoded as UTF-8 and handed to the csv parsing service line by line.

    :param request: the incoming POST request
    :return: Response with the primary key and title of the resulting corpus,
        or a 400 Response when no readable file was supplied
    """
    # Import the function explicitly: the module-level ``parse_csv`` name may be
    # bound to the services submodule rather than to the function inside it.
    from app.services.parse_csv import parse_csv as build_corpus

    uploaded = request.data.get("filename")
    if uploaded is None or not hasattr(uploaded, "read"):
        return Response({"error": "No csv file was uploaded under 'filename'."}, status=400)

    # csv.DictReader needs an iterable of text lines, not the raw request payload
    lines = uploaded.read().decode('utf-8').splitlines()
    csv_reader = csv.DictReader(lines)
    corpus = build_corpus(csv_reader, getattr(uploaded, "name", "Uploaded"))
    return Response({"id": corpus.pk, "title": corpus.title})


Nisha-Nathan marked this conversation as resolved.
Show resolved Hide resolved
46 changes: 46 additions & 0 deletions backend/app/analysis/auto_instances.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
from app.models import Document, Corpus
import os
import csv
from app.services.parse_csv import parse_csv

def create_instances(directory=None):
    """
    Build Document objects from the text files found in a directory.

    One Document is constructed for every line of every file; its title is the
    file name with ".txt" removed and its text is that line. The Documents are
    only constructed here, this function never calls ``save()`` on them.

    :param directory: folder containing the text files; defaults to the
        "small_talks" folder located next to this module
    :return: list of the constructed Document instances
    """
    if directory is None:
        # Resolve relative to this module instead of a machine-specific absolute path
        directory = os.path.join(os.path.dirname(os.path.abspath(__file__)), "small_talks")

    doc_list = []
    for filename in os.listdir(directory):
        doc_title = filename.replace(".txt", "")
        # os.path.join keeps the path valid on every platform (no hard-coded "\\")
        with open(os.path.join(directory, filename), encoding="utf-8") as handle:
            doc_list.extend(Document(title=doc_title, text=line) for line in handle)

    return doc_list

def create_instances_new(directory=None):
    """
    Store a Document for each text file in a directory whose title is not stored yet.

    The title is the file name with ".txt" removed. The existence check runs
    before each line is used, so once a Document with that title exists the
    remaining lines of the file are skipped.

    :param directory: folder containing the text files; defaults to the
        "small_talks" folder located next to this module
    :return: None
    """
    if directory is None:
        # Resolve relative to this module instead of a machine-specific absolute path
        directory = os.path.join(os.path.dirname(os.path.abspath(__file__)), "small_talks")

    for filename in os.listdir(directory):
        doc_title = filename.replace(".txt", "")  # same for every line, so computed once
        with open(os.path.join(directory, filename), encoding="utf-8") as handle:
            for line in handle:
                # exists() answers "is there any row?" without counting all matches
                if Document.objects.filter(title=doc_title).exists():
                    break  # every later line would hit the same existing title
                new_doc = Document.objects.create_document(title=doc_title, text=line)
                new_doc.save()

def create_instances_newer(csv_filename):
    """
    Parse a csv file located next to this module and return what parse_csv builds.

    :param csv_filename: name of the csv file, joined onto this module's directory;
        it is also passed to parse_csv as the title
    :return: the value returned by parse_csv
    """
    module_dir = os.path.dirname(os.path.abspath(__file__))
    csv_path = os.path.join(module_dir, csv_filename)

    # The reader is lazy, so parsing has to finish while the file is still open
    with open(csv_path, encoding='utf-8') as handle:
        return parse_csv(csv.DictReader(handle), csv_filename)

def main():
    """Run create_instances_newer on "small_talks.csv"; its result is not used."""
    create_instances_newer("small_talks.csv")
ADJohnson314 marked this conversation as resolved.
Show resolved Hide resolved
49 changes: 49 additions & 0 deletions backend/app/analysis/test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import csv
import os

current_path = os.path.dirname(os.path.abspath(__file__)) # absolute path of the directory containing this module
test_csv = os.path.join(current_path, 'small_talks.csv') # path of small_talks.csv inside that directory

def parse_csv(path_to_files):
    """
    Split a csv file into one text file per row.

    A folder named after the csv (its path without the extension) is created
    when missing. For every row, the 'transcript' column is written to
    "<title>.txt" inside that folder, with any "?" removed from the file name.

    :param path_to_files: path of a csv file with 'title' and 'transcript' columns
    :return: path of the folder the text files were written to
    """
    path_to_folder = os.path.splitext(path_to_files)[0]
    print(f"Path: {path_to_folder}")
    # exist_ok avoids the check-then-create race of a separate os.path.exists test
    os.makedirs(path_to_folder, exist_ok=True)
    print(f"Path to files: {path_to_files}")

    # newline='' lets the csv module handle newlines embedded in quoted fields
    with open(path_to_files, encoding='utf-8', newline='') as test_data_csv:
        for data in csv.DictReader(test_data_csv):
            # remove invalid file-name characters (currently only "?")
            filename = f"{data['title']}.txt".replace("?", "")
            with open(os.path.join(path_to_folder, filename), "w", encoding='utf8') as f:
                f.write(data['transcript'])

    return path_to_folder

def test_function():
    """Print the path stored in test_csv, then every row read from that csv file."""
    print(test_csv)
    with open(test_csv, encoding='utf-8') as handle:
        for record in csv.DictReader(handle):
            print(record)

#def convert_ted_talks_csv(csv_path):
# with open(csv_path, encoding='utf-8') as csv_file:
# reader = csv.reader(csv_file)
# for line in reader:
# print(line)


# When run as a script, split the csv at test_csv into per-row text files.
if __name__ == '__main__':
    parse_csv(test_csv)
ADJohnson314 marked this conversation as resolved.
Show resolved Hide resolved
37 changes: 37 additions & 0 deletions backend/app/services/parse_csv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
from app.models import Document, Corpus

def parse_csv(csv_reader, title):
    """
    Build (or refresh) a corpus from the rows of a csv reader.

    Every row is looked up by its 'title'. A row whose title is not stored yet
    becomes a new Document built from the 'title', 'transcript', 'speaker_1'
    and 'recorded_date' columns; otherwise the stored Document is reused. The
    corpus titled "<title> Corpus" is created when missing, and its documents
    are replaced with the ones gathered from the rows.

    :param csv_reader: iterable of rows indexable by column name,
        e.g. a ``csv.DictReader``
    :param title: label used to name the corpus ("<title> Corpus")
    :return: the Corpus instance holding the gathered documents
    """
    doc_list = []
    for data in csv_reader:
        # A single query both checks for and fetches an already stored document
        document = Document.objects.filter(title=data['title']).first()
        if document is None:
            document = Document.objects.create_document(
                title=data['title'],
                text=data['transcript'],
                author=data['speaker_1'],
                year=data['recorded_date'][:4],  # first four characters of the date
            )
            document.save()
        doc_list.append(document)

    # Use the id as a unique identifier instead of the title (or maybe remove this check entirely)
    corpus_title = f"{title} Corpus"
    new_corpus = Corpus.objects.filter(title=corpus_title).first()
    if new_corpus is None:
        new_corpus = Corpus(title=corpus_title)
        new_corpus.save()

    new_corpus.documents.set(doc_list)
    new_corpus.save()

    return new_corpus
ADJohnson314 marked this conversation as resolved.
Show resolved Hide resolved
29 changes: 28 additions & 1 deletion backend/app/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
}
"""
import json
import csv


from rest_framework.decorators import api_view
from rest_framework.response import Response
Expand All @@ -36,7 +38,8 @@
GenderSerializer,
CorpusSerializer
)

from app.analysis.auto_instances import create_instances_newer
Nisha-Nathan marked this conversation as resolved.
Show resolved Hide resolved
from app.services.parse_csv import parse_csv

@api_view(['GET'])
def get_example(request, example_id):
Expand Down Expand Up @@ -102,6 +105,7 @@ def add_document(request):
"""
API endpoint for adding a piece of document
"""
print("we are in add_document")
Nisha-Nathan marked this conversation as resolved.
Show resolved Hide resolved
attributes = request.data
new_attributes = {}
for attribute in attributes['newAttributes']:
Expand Down Expand Up @@ -129,6 +133,19 @@ def all_documents(request):
serializer = SimpleDocumentSerializer(doc_objs, many=True)
return Response(serializer.data)

@api_view(['POST'])
def upload_document(request):
    """
    API endpoint for uploading a csv file and converting its rows into a corpus.

    The uploaded file is read from the "filename" key of the request data and
    decoded as UTF-8. Responds with the serialized corpus, or with a 400 status
    when no readable file was supplied.
    """
    file = request.data.get("filename")
    if file is None or not hasattr(file, "read"):
        return Response({"error": "No csv file was uploaded under 'filename'."}, status=400)

    content = file.read().decode('utf-8').splitlines()
    csv_reader = csv.DictReader(content)
    # parse_csv requires a title for the corpus it builds; use the uploaded file's name
    new_corpus = parse_csv(csv_reader, getattr(file, "name", "Uploaded"))
    serializer = CorpusSerializer(new_corpus)
    return Response(serializer.data)

@api_view(['GET'])
def get_document(request, doc_id):
Expand Down Expand Up @@ -211,6 +228,16 @@ def update_corpus_docs(request):
serializer = CorpusSerializer(corpus_obj)
return Response(serializer.data)

@api_view(['POST'])
def create_corpus_csv(request):
    """
    API endpoint for converting a csv file into a corpus.

    The file name is read from the "csv" key of the request data and passed to
    create_instances_newer. Responds with the serialized corpus, with a 400
    status when the name is missing or contains path components, or with a 404
    status when the file cannot be found.
    """
    # Imported locally so this view does not rely on the module-level imports.
    import os

    filename = request.data.get("csv")
    # The name comes from the client and ends up in a file path, so accept only
    # a bare file name (no directories, no "..", no absolute paths).
    if (
        not isinstance(filename, str)
        or not filename
        or filename in (".", "..")
        or os.path.basename(filename) != filename
    ):
        return Response({"error": "'csv' must be a plain file name."}, status=400)

    try:
        return_corpus = create_instances_newer(filename)
    except FileNotFoundError:
        return Response({"error": f"No csv file named '{filename}' was found."}, status=404)

    serializer = CorpusSerializer(return_corpus)
    return Response(serializer.data)
Nisha-Nathan marked this conversation as resolved.
Show resolved Hide resolved

@api_view(['DELETE'])
def delete_corpus(request):
Expand Down
5 changes: 4 additions & 1 deletion backend/config/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@
path('api/update_corpus_docs', views.update_corpus_docs),
path('api/delete_corpus', views.delete_corpus),
path('api/corpus/<int:corpus_id>', views.get_corpus),
path('api/corpus_from_csv/', views.create_corpus_csv),
path('api/upload_document', views.upload_document),
Nisha-Nathan marked this conversation as resolved.
Show resolved Hide resolved

# View paths
path('', views.index, name='index'),
Expand All @@ -43,5 +45,6 @@
path('documents', views.documents, name='documents'),
path('document/<int:doc_id>', views.single_document, name='document'),
path('corpora', views.corpora, name='corpora'),
path('corpus/<int:corpus_id>', views.corpus, name='corpus')
path('corpus/<int:corpus_id>', views.corpus, name='corpus'),

ADJohnson314 marked this conversation as resolved.
Show resolved Hide resolved
]