validator added #116

Open · wants to merge 3 commits into dev
29 changes: 0 additions & 29 deletions .env.example

This file was deleted.

3 changes: 3 additions & 0 deletions .gitignore
@@ -1,6 +1,7 @@
config.env
all_data
vhosts
.env.example
# Ignore all json and csv files
*.json
*.csv
@@ -19,6 +20,8 @@ frontend/

# Env
.env*
.env.dev*
.env.example*
virtualenv/
myvenv/
venv/
2 changes: 1 addition & 1 deletion backend/Dockerfile
@@ -15,4 +15,4 @@ COPY . /usr/src/backend/

RUN ["chmod", "+x", "/usr/src/backend/entrypoint.sh"]

ENTRYPOINT ["sh", "/usr/src/backend/entrypoint.sh"]
ENTRYPOINT ["sh", "/usr/src/backend/entrypoint.sh"]
3 changes: 0 additions & 3 deletions backend/dataset/tests.py
@@ -1,3 +0,0 @@
from django.test import TestCase

# Create your tests here.
89 changes: 89 additions & 0 deletions backend/dataset/validatemod.py
@@ -0,0 +1,89 @@
import csv
import json
from datetime import datetime

def MultiModelInteractionValidator(file):
    """Validate a MultiModelInteraction CSV upload and return a list of error messages."""
    required_fields = {
        "instance_id",
        "parent_interaction_ids",
        "multiple_interaction_json",
        "language",
        "datetime",
    }
    optional_fields = {
        "eval_form_json",
        "no_of_turns",
        "no_of_models",
    }
    valid_languages = [
        "English", "Assamese", "Bengali", "Bodo", "Dogri", "Gujarati",
        "Hindi", "Kannada", "Kashmiri", "Konkani", "Maithili",
        "Malayalam", "Manipuri", "Marathi", "Nepali", "Odia",
        "Punjabi", "Sanskrit", "Santali", "Sindhi", "Sinhala",
        "Tamil", "Telugu", "Urdu",
    ]

    errors = []

    # Accept either an uploaded file object or the raw CSV content (str/bytes)
    content = file.read() if hasattr(file, "read") else file
    if isinstance(content, bytes):
        content = content.decode("utf-8-sig")
    reader = csv.DictReader(content.splitlines())

    for index, row in enumerate(reader, start=1):
        row_fields = set(row.keys())

        # Check required fields
        missing_required = required_fields - row_fields
        if missing_required:
            errors.append(f"Row {index} missing required fields: {missing_required}")

        # Check unexpected fields
        unexpected_fields = row_fields - (required_fields | optional_fields)
        if unexpected_fields:
            errors.append(f"Row {index} has unexpected fields: {unexpected_fields}")

        # Validate instance_id
        if not row.get("instance_id"):
            errors.append(f"Row {index} must have 'instance_id'")

        # Validate parent_interaction_ids
        parent_ids = row.get("parent_interaction_ids")
        if parent_ids and not (parent_ids.startswith("[") and parent_ids.endswith("]")):
            errors.append(f"Row {index}: parent_interaction_ids should be a JSON array format")

        # Validate multiple_interaction_json
        try:
            interactions = json.loads(row.get("multiple_interaction_json") or "")
        except json.JSONDecodeError:
            errors.append(f"Row {index}: multiple_interaction_json must be valid JSON")
            interactions = []
        if not isinstance(interactions, list):
            errors.append(f"Row {index}: multiple_interaction_json must be a JSON array")
            interactions = []
        for interaction in interactions:
            if not isinstance(interaction, dict):
                errors.append(f"Row {index}: Each interaction must be a JSON object")
                continue
            if "prompt" not in interaction:
                errors.append(f"Row {index}: Each interaction must contain 'prompt'")
            if "prompt_output_pair_id" not in interaction:
                errors.append(f"Row {index}: Each interaction must contain 'prompt_output_pair_id'")
            if "model_responses_json" not in interaction:
                errors.append(f"Row {index}: Each interaction must contain 'model_responses_json'")
            for model_response in interaction.get("model_responses_json", []):
                for response_key, response_value in model_response.items():
                    if "model_name" not in response_value:
                        errors.append(f"Row {index}: {response_key} should contain 'model_name'")
                    if "output" not in response_value:
                        errors.append(f"Row {index}: {response_key} should contain 'output'")

        # Validate language
        language = row.get("language")
        if language not in valid_languages:
            errors.append(f"Row {index}: Invalid language '{language}'")

        # Validate datetime (ISO 8601 expected)
        if row.get("datetime"):
            try:
                datetime.fromisoformat(row["datetime"])
            except ValueError:
                errors.append(f"Row {index}: Invalid datetime format for '{row['datetime']}'")

        # Validate optional integer fields
        for field in ("no_of_turns", "no_of_models"):
            if row.get(field) and not row[field].isdigit():
                errors.append(f"Row {index}: {field} should be an integer if provided")

    return errors
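
For reviewers, here is a minimal sketch of how the new validator can be exercised on its own, using a hand-built single-row CSV. The sample values and the import path (dataset.validatemod) are assumptions for illustration, not part of this PR.

import csv
import io
import json

from dataset.validatemod import MultiModelInteractionValidator  # assumed import path

# Hypothetical well-formed row; every value below is made up for the sketch.
sample_row = {
    "instance_id": "1",
    "parent_interaction_ids": "[]",
    "multiple_interaction_json": json.dumps([
        {
            "prompt": "Hello",
            "prompt_output_pair_id": "p1",
            "model_responses_json": [
                {"model_1": {"model_name": "demo-model", "output": "Hi there"}}
            ],
        }
    ]),
    "language": "Hindi",
    "datetime": "2024-05-01T10:00:00",
}

# Serialise the row to CSV in memory, then hand the encoded bytes to the validator
buffer = io.StringIO()
writer = csv.DictWriter(buffer, fieldnames=list(sample_row.keys()))
writer.writeheader()
writer.writerow(sample_row)

errors = MultiModelInteractionValidator(io.BytesIO(buffer.getvalue().encode("utf-8")))
print(errors)  # expected to be [] for a well-formed row
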
14 changes: 12 additions & 2 deletions backend/dataset/views.py
@@ -3,7 +3,7 @@
import re
from base64 import b64encode
from urllib.parse import parse_qsl

from .validatemod import MultiModelInteractionValidator
from django.apps import apps
from django.db.models import Q
from django.http import StreamingHttpResponse, JsonResponse, HttpResponse
@@ -382,7 +382,17 @@ def upload(self, request, pk):
},
status=status.HTTP_400_BAD_REQUEST,
)


# Run the MultiModelInteraction validator for that dataset type only
# (adjust the identifier below if the project uses a different type name)
if dataset_type == "MultiModelInteraction":
    validation_errors = MultiModelInteractionValidator(dataset_string)
    if validation_errors:
        return Response(
            {"message": "Validation failed", "errors": validation_errors},
            status=status.HTTP_400_BAD_REQUEST,
        )

# Upload the dataset to the dataset instance
upload_data_to_data_instance.delay(
pk=pk,
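
To show the error path that the new branch in upload() surfaces to clients, here is a hedged sketch that feeds the validator a deliberately malformed row; the row contents are invented and the import path is the same assumption as above. Whatever messages the validator returns end up in the "errors" list of the 400 response.

import io

from dataset.validatemod import MultiModelInteractionValidator  # assumed import path

# One row that violates several rules at once (all values are made up)
bad_csv = (
    "instance_id,parent_interaction_ids,multiple_interaction_json,language,datetime\n"
    "1,not-a-list,{bad json,Klingon,yesterday\n"
)

errors = MultiModelInteractionValidator(io.BytesIO(bad_csv.encode("utf-8")))
for message in errors:
    print(message)
# Expected complaints: parent_interaction_ids is not a JSON array,
# multiple_interaction_json is not valid JSON, 'Klingon' is not a supported
# language, and 'yesterday' is not an ISO datetime.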