From 2c36d9e8bf83fb45ccbcf7b677ec61a9064079b7 Mon Sep 17 00:00:00 2001
From: kartikvirendrar <kartikrajput357@gmail.com>
Date: Thu, 28 Nov 2024 16:30:49 +0530
Subject: [PATCH 01/17] added new project type OCRTextlineSegmentation

---
 backend/projects/annotation_registry.py          |  9 +++++++++
 .../ocr/ocr_textline_segmentation.jsx            |  5 +++++
 backend/projects/project_registry.yaml           | 16 ++++++++++++++++
 3 files changed, 30 insertions(+)
 create mode 100644 backend/projects/label_studio_jsx_files/ocr/ocr_textline_segmentation.jsx
diff --git a/backend/projects/annotation_registry.py b/backend/projects/annotation_registry.py
index 2dddab5d3..dd4c53452 100644
--- a/backend/projects/annotation_registry.py
+++ b/backend/projects/annotation_registry.py
@@ -108,6 +108,15 @@
             "type": "labels",
         },
     },
+    "OCRTextlineSegmentation": {
+        "ocr_transcribed_json": {
+            "to_name": "image_url",
+            "from_name": [
+                "annotation_bboxes",
+            ],
+            "type": ["rectangle"],
+        },
+    },
     "OCRTranscription": {
         "ocr_transcribed_json": {
             "to_name": "image_url",
diff --git a/backend/projects/label_studio_jsx_files/ocr/ocr_textline_segmentation.jsx b/backend/projects/label_studio_jsx_files/ocr/ocr_textline_segmentation.jsx
new file mode 100644
index 000000000..7684b037b
--- /dev/null
+++ b/backend/projects/label_studio_jsx_files/ocr/ocr_textline_segmentation.jsx
@@ -0,0 +1,5 @@
+<View>
+  <Style>.ant-input { font-size: large; }</Style>
+  <Image name="image_url" value="$image_url"/>
+  <Rectangle name="annotation_bboxes" toName="image_url" strokeWidth="3" className="ignore_assertion"/>
+</View>
diff --git a/backend/projects/project_registry.yaml b/backend/projects/project_registry.yaml
index b4c0c8d85..f36c098f9 100644
--- a/backend/projects/project_registry.yaml
+++ b/backend/projects/project_registry.yaml
@@ -99,6 +99,22 @@ OCR:
         fields:
           annotations:
             - ocr_transcribed_json
+    OCRTextlineSegmentation:
+      project_mode: "Annotation"
+      label_studio_jsx_file: "ocr_textline_segmentation.jsx"
+      input_dataset:
+        class: OCRDocument
+        fields:
+          - image_url
+        display_fields:
+          - image_url
+        prediction: ocr_prediction_json
+      output_dataset: 
+        class: OCRDocument
+        save_type: in_place
+        fields:
+          annotations:
+            - ocr_transcribed_json
     OCRTranscriptionEditing:
       project_mode: "Annotation"
       label_studio_jsx_file: "ocr/ocr_transcription.jsx"

From 7c6542493bbe0f6e411101c11f9944e9372a351e Mon Sep 17 00:00:00 2001
From: Kartik Virendra Rajput
 <88619994+kartikvirendrar@users.noreply.github.com>
Date: Thu, 28 Nov 2024 16:47:01 +0530
Subject: [PATCH 02/17] Update project_registry.yaml

---
 backend/projects/project_registry.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/projects/project_registry.yaml b/backend/projects/project_registry.yaml
index f36c098f9..1420cffd9 100644
--- a/backend/projects/project_registry.yaml
+++ b/backend/projects/project_registry.yaml
@@ -101,7 +101,7 @@ OCR:
             - ocr_transcribed_json
     OCRTextlineSegmentation:
       project_mode: "Annotation"
-      label_studio_jsx_file: "ocr_textline_segmentation.jsx"
+      label_studio_jsx_file: "ocr/ocr_textline_segmentation.jsx"
       input_dataset:
         class: OCRDocument
         fields:

From 286b385a2b137d102635237084426b72719ceb4b Mon Sep 17 00:00:00 2001
From: Kunal Tiwary <kunaltiwary7@gmail.com>
Date: Wed, 4 Dec 2024 05:37:48 +0000
Subject: [PATCH 03/17] Add page_number field and prediction conversion for OCRTextlineSegmentation

---
 backend/projects/project_registry.yaml | 2 ++
 backend/projects/views.py              | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/backend/projects/project_registry.yaml b/backend/projects/project_registry.yaml
index 1420cffd9..3c1e4f5eb 100644
--- a/backend/projects/project_registry.yaml
+++ b/backend/projects/project_registry.yaml
@@ -106,8 +106,10 @@ OCR:
         class: OCRDocument
         fields:
           - image_url
+          - page_number
         display_fields:
           - image_url
+          - page_number
         prediction: ocr_prediction_json
       output_dataset: 
         class: OCRDocument
diff --git a/backend/projects/views.py b/backend/projects/views.py
index 68e48c472..cb48392f9 100644
--- a/backend/projects/views.py
+++ b/backend/projects/views.py
@@ -917,6 +917,7 @@ def convert_prediction_json_to_annotation_result(pk, proj_type):
     elif (
         proj_type == "OCRTranscriptionEditing"
         or proj_type == "OCRSegmentCategorizationEditing"
+        or proj_type == "OCRTextlineSegmentation"
     ):
         data_item = OCRDocument.objects.get(pk=pk)
         ocr_prediction_json = (
@@ -2365,6 +2366,7 @@ def assign_new_tasks(self, request, pk, *args, **kwargs):
                 "AudioTranscriptionEditing",
                 "OCRTranscriptionEditing",
                 "OCRSegmentCategorizationEditing",
+                "OCRTextlineSegmentation",
             ]:
                 try:
                     result = convert_prediction_json_to_annotation_result(

From c43552cd90b4a072db7fb4ea326eded1d9a18aff Mon Sep 17 00:00:00 2001
From: Ishvinder Sethi <ishvindersethi22@gmail.com>
Date: Thu, 5 Dec 2024 10:11:23 +0530
Subject: [PATCH 04/17] Update views.py

---
 backend/projects/views.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/projects/views.py b/backend/projects/views.py
index 68e48c472..9e7075cab 100644
--- a/backend/projects/views.py
+++ b/backend/projects/views.py
@@ -1,4 +1,4 @@
-import re
+import reg
 from collections import OrderedDict
 from datetime import datetime
 from time import sleep

From c56fb8ea3b1b440fb65fac89a60a382eda68a399 Mon Sep 17 00:00:00 2001
From: Tahir <68889901+tahirjmakhdoomi@users.noreply.github.com>
Date: Thu, 5 Dec 2024 10:34:39 +0530
Subject: [PATCH 05/17] Update views.py

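Added support for l1 and l2 in prediction_json: a bare list is wrapped as
{"verbatim_transcribed_json": <list>} and the resulting dict is then
processed one prediction type at a time.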
---
 backend/projects/views.py | 96 +++++++++++++++++++++------------------
 1 file changed, 52 insertions(+), 44 deletions(-)

diff --git a/backend/projects/views.py b/backend/projects/views.py
index 9e7075cab..18ebbdf8c 100644
--- a/backend/projects/views.py
+++ b/backend/projects/views.py
@@ -1,4 +1,4 @@
-import reg
+import re
 from collections import OrderedDict
 from datetime import datetime
 from time import sleep
@@ -866,54 +866,62 @@ def convert_prediction_json_to_annotation_result(pk, proj_type):
             if isinstance(data_item.prediction_json, str)
             else data_item.prediction_json
         )
+        assert type(prediction_json) in [dict, list], "Seems something is wrong with the formatting"
+        # see if the prediction is a list, then it seems that only verbatim json is present
+        if isinstance(prediction_json,list):
+            prediction_json = {
+                    "verbatim_transcribed_json": prediction_json 
+            }
+        
         speakers_json = data_item.speakers_json
         audio_duration = data_item.audio_duration
         # converting prediction_json to result (wherever it exists) for every task.
         if prediction_json == None:
             return result
-        for idx, val in enumerate(prediction_json):
-            label_dict = {
-                "origin": "manual",
-                "to_name": "audio_url",
-                "from_name": "labels",
-                "original_length": audio_duration,
-            }
-            text_dict = {
-                "origin": "manual",
-                "to_name": "audio_url",
-                "from_name": "transcribed_json",
-                "original_length": audio_duration,
-            }
-            if proj_type == "AcousticNormalisedTranscriptionEditing":
-                text_dict["from_name"] = "verbatim_transcribed_json"
-            id = f"shoonya_{idx}s{generate_random_string(13 - len(str(idx)))}"
-            label_dict["id"] = id
-            text_dict["id"] = id
-            label_dict["type"] = "labels"
-            text_dict["type"] = "textarea"
-
-            value_labels = {
-                "start": val["start"],
-                "end": val["end"],
-                "labels": [
-                    next(
-                        speaker
-                        for speaker in speakers_json
-                        if speaker["speaker_id"] == val["speaker_id"]
-                    )["name"]
-                ],
-            }
-            value_text = {
-                "start": val["start"],
-                "end": val["end"],
-                "text": [val["text"]],
-            }
-
-            label_dict["value"] = value_labels
-            text_dict["value"] = value_text
-            # mainly label_dict and text_dict are sent as result
-            result.append(label_dict)
-            result.append(text_dict)
+        for pred_type, pred_json in prediction_json.keys():
+            for idx, val in enumerate(pred_json):
+                label_dict = {
+                    "origin": "manual",
+                    "to_name": "audio_url",
+                    "from_name": "labels",
+                    "original_length": audio_duration,
+                }
+                text_dict = {
+                    "origin": "manual",
+                    "to_name": "audio_url",
+                    "from_name": "transcribed_json",
+                    "original_length": audio_duration,
+                }
+                if proj_type == "AcousticNormalisedTranscriptionEditing":
+                    text_dict["from_name"] = pred_type
+                id = f"shoonya_{idx}s{generate_random_string(13 - len(str(idx)))}"
+                label_dict["id"] = id
+                text_dict["id"] = id
+                label_dict["type"] = "labels"
+                text_dict["type"] = "textarea"
+    
+                value_labels = {
+                    "start": val["start"],
+                    "end": val["end"],
+                    "labels": [
+                        next(
+                            speaker
+                            for speaker in speakers_json
+                            if speaker["speaker_id"] == val["speaker_id"]
+                        )["name"]
+                    ],
+                }
+                value_text = {
+                    "start": val["start"],
+                    "end": val["end"],
+                    "text": [val["text"]],
+                }
+    
+                label_dict["value"] = value_labels
+                text_dict["value"] = value_text
+                # mainly label_dict and text_dict are sent as result
+                result.append(label_dict)
+                result.append(text_dict)
     elif (
         proj_type == "OCRTranscriptionEditing"
         or proj_type == "OCRSegmentCategorizationEditing"

From 8f7a94f59566de97590a8aab0ad25960a76c16d3 Mon Sep 17 00:00:00 2001
From: Tahir <68889901+tahirjmakhdoomi@users.noreply.github.com>
Date: Thu, 5 Dec 2024 10:48:01 +0530
Subject: [PATCH 06/17] Update annotation_registry.py

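Added support for l1 and l2 in draft data JSON: a bare list is wrapped as
{"verbatim_transcribed_json": <list>} and each entry of the resulting dict
is converted separately.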
---
 backend/projects/annotation_registry.py | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/backend/projects/annotation_registry.py b/backend/projects/annotation_registry.py
index 2dddab5d3..0052f569b 100644
--- a/backend/projects/annotation_registry.py
+++ b/backend/projects/annotation_registry.py
@@ -263,13 +263,21 @@ def draft_data_json_to_annotation_result(draft_data_json, project_type, pk=None)
             if field == "conversation_json":
                 ans = convert_conversation_json_to_annotation_result(value, idx)
             elif field == "transcribed_json" or field == "prediction_json":
-                ans = convert_prediction_json_to_annotation_result(
-                    value,
-                    dataset_item.speakers_json,
-                    dataset_item.audio_duration,
-                    idx,
-                    project_type == "AcousticNormalisedTranscriptionEditing",
-                )
+                assert type(value) in [list, dict], f"Something wrong is there in the type of {value}"
+                if isinstance(value,list):
+                    value = {
+                        "verbatim_transcribed_json": value
+                    }
+                for tred_type, tred_value in value.items():
+                    sub_ans = convert_prediction_json_to_annotation_result(
+                        tred_value,
+                        dataset_item.speakers_json,
+                        dataset_item.audio_duration,
+                        idx,
+                        project_type == "AcousticNormalisedTranscriptionEditing",
+                        tred_type=tred_type
+                    )
+                    ans.extend(sub_ans)
             else:
                 if field_type == "textarea":
                     field_dict["value"] = {"text": [value]}

From 9de45a06817d3dc4ee3ad4058b394013120ff730 Mon Sep 17 00:00:00 2001
From: Tahir <68889901+tahirjmakhdoomi@users.noreply.github.com>
Date: Thu, 5 Dec 2024 10:53:10 +0530
Subject: [PATCH 07/17] Update annotation_registry.py

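Added support for l1 and l2: the text entry's from_name is now taken from the
new tred_type parameter instead of a hard-coded value.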
---
 backend/projects/annotation_registry.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/backend/projects/annotation_registry.py b/backend/projects/annotation_registry.py
index 0052f569b..da1851671 100644
--- a/backend/projects/annotation_registry.py
+++ b/backend/projects/annotation_registry.py
@@ -168,7 +168,7 @@
 
 
 def convert_prediction_json_to_annotation_result(
-    prediction_json, speakers_json, audio_duration, index, is_acoustic=False
+    prediction_json, speakers_json, audio_duration, index, tred_type, is_acoustic=False
 ):
     """
     Convert prediction_json and transcribed_json to annotation_result
@@ -192,7 +192,7 @@ def convert_prediction_json_to_annotation_result(
             "original_length": audio_duration,
         }
         if is_acoustic:
-            text_dict["from_name"] = "verbatim_transcribed_json"
+            text_dict["from_name"] = tred_type
         id = f"shoonya_{index}s{idx}s{generate_random_string(13-len(str(idx)))}"
         label_dict["id"] = id
         text_dict["id"] = id

From 600b0e8e8f9728fcad11453973e0a54c270300a4 Mon Sep 17 00:00:00 2001
From: Tahir <68889901+tahirjmakhdoomi@users.noreply.github.com>
Date: Thu, 5 Dec 2024 10:54:47 +0530
Subject: [PATCH 08/17] Update annotation_registry.py

---
 backend/projects/annotation_registry.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/backend/projects/annotation_registry.py b/backend/projects/annotation_registry.py
index da1851671..736b492ef 100644
--- a/backend/projects/annotation_registry.py
+++ b/backend/projects/annotation_registry.py
@@ -274,8 +274,8 @@ def draft_data_json_to_annotation_result(draft_data_json, project_type, pk=None)
                         dataset_item.speakers_json,
                         dataset_item.audio_duration,
                         idx,
-                        project_type == "AcousticNormalisedTranscriptionEditing",
-                        tred_type=tred_type
+                        tred_type = tred_type
+                        is_acoustic = (project_type == "AcousticNormalisedTranscriptionEditing"),
                     )
                     ans.extend(sub_ans)
             else:

From 491659d4c1a96f56c647b352078c7240ab1d754b Mon Sep 17 00:00:00 2001
From: Tahir <68889901+tahirjmakhdoomi@users.noreply.github.com>
Date: Thu, 5 Dec 2024 11:04:53 +0530
Subject: [PATCH 09/17] Fix missing comma between keyword arguments in annotation_registry.py

---
 backend/projects/annotation_registry.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/backend/projects/annotation_registry.py b/backend/projects/annotation_registry.py
index 736b492ef..002896007 100644
--- a/backend/projects/annotation_registry.py
+++ b/backend/projects/annotation_registry.py
@@ -274,8 +274,8 @@ def draft_data_json_to_annotation_result(draft_data_json, project_type, pk=None)
                         dataset_item.speakers_json,
                         dataset_item.audio_duration,
                         idx,
-                        tred_type = tred_type
-                        is_acoustic = (project_type == "AcousticNormalisedTranscriptionEditing"),
+                        tred_type = tred_type,
+                        is_acoustic = (project_type == "AcousticNormalisedTranscriptionEditing")
                     )
                     ans.extend(sub_ans)
             else:

From af10b5a3da9e9b97feada2cfc0cf1828443d60cf Mon Sep 17 00:00:00 2001
From: Tahir <68889901+tahirjmakhdoomi@users.noreply.github.com>
Date: Thu, 5 Dec 2024 14:33:45 +0530
Subject: [PATCH 10/17] Iterate prediction_json.items() instead of .keys() in views.py

---
 backend/projects/views.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/projects/views.py b/backend/projects/views.py
index 18ebbdf8c..637837638 100644
--- a/backend/projects/views.py
+++ b/backend/projects/views.py
@@ -878,7 +878,7 @@ def convert_prediction_json_to_annotation_result(pk, proj_type):
         # converting prediction_json to result (wherever it exists) for every task.
         if prediction_json == None:
             return result
-        for pred_type, pred_json in prediction_json.keys():
+        for pred_type, pred_json in prediction_json.items():
             for idx, val in enumerate(pred_json):
                 label_dict = {
                     "origin": "manual",

From 5c6b91da3b06d2d98fc981caba387513fe9f5baa Mon Sep 17 00:00:00 2001
From: Tahir <68889901+tahirjmakhdoomi@users.noreply.github.com>
Date: Sat, 7 Dec 2024 11:29:25 +0530
Subject: [PATCH 11/17] Update views.py

---
 backend/projects/views.py | 69 +++++++++++++++++++++++++++++++++++++--
 1 file changed, 66 insertions(+), 3 deletions(-)

diff --git a/backend/projects/views.py b/backend/projects/views.py
index 637837638..c2816c7d7 100644
--- a/backend/projects/views.py
+++ b/backend/projects/views.py
@@ -878,8 +878,71 @@ def convert_prediction_json_to_annotation_result(pk, proj_type):
         # converting prediction_json to result (wherever it exists) for every task.
         if prediction_json == None:
             return result
-        for pred_type, pred_json in prediction_json.items():
-            for idx, val in enumerate(pred_json):
+        # for pred_type, pred_json in prediction_json.items():
+        if 'acoustic_normalised_transcribed_json' in pred_json.keys():
+            for idx, val, val_acoustic in enumerate(zip(pred_json['verbatim_transcribed_json'],pred_json['acoustic_normalised_transcribed_json'])):
+                label_dict = {
+                    "origin": "manual",
+                    "to_name": "audio_url",
+                    "from_name": "labels",
+                    "original_length": audio_duration,
+                }
+                text_dict = {
+                    "origin": "manual",
+                    "to_name": "audio_url",
+                    "from_name": "transcribed_json",
+                    "original_length": audio_duration,
+                }
+                text_dict_acoustic = {
+                    "origin": "manual",
+                    "to_name": "audio_url",
+                    "from_name": "transcribed_json",
+                    "original_length": audio_duration,
+                }
+                if proj_type == "AcousticNormalisedTranscriptionEditing":
+                    text_dict["from_name"] = 'verbatim_transcribed_json'
+                    text_dict_acoustic["from_name"] = 'acoustic_normalised_transcribed_json'
+                    
+                id = f"shoonya_{idx}s{generate_random_string(13 - len(str(idx)))}"
+                label_dict["id"] = id
+                text_dict["id"] = id
+                text_dict_acoustic["id"] = id
+                
+                label_dict["type"] = "labels"
+                text_dict["type"] = "textarea"
+                text_dict_acoustic["type"] = "textarea"
+    
+                value_labels = {
+                    "start": val["start"],
+                    "end": val["end"],
+                    "labels": [
+                        next(
+                            speaker
+                            for speaker in speakers_json
+                            if speaker["speaker_id"] == val["speaker_id"]
+                        )["name"]
+                    ],
+                }
+                value_text = {
+                    "start": val["start"],
+                    "end": val["end"],
+                    "text": [val["text"]],
+                }
+                value_text_acoustic = {
+                    "start": val_acoustic["start"],
+                    "end": val_acoustic["end"],
+                    "text": [val_acoustic["text"]],
+                }
+    
+                label_dict["value"] = value_labels
+                text_dict["value"] = value_text
+                text_dict_acoustic["value"] = value_text_acoustic
+                # mainly label_dict and text_dict are sent as result
+                result.append(label_dict)
+                result.append(text_dict)
+                result.append(text_dict_acoustic)
+        else:
+            for idx, val in enumerate(pred_json['verbatim_transcribed_json']):
                 label_dict = {
                     "origin": "manual",
                     "to_name": "audio_url",
@@ -893,7 +956,7 @@ def convert_prediction_json_to_annotation_result(pk, proj_type):
                     "original_length": audio_duration,
                 }
                 if proj_type == "AcousticNormalisedTranscriptionEditing":
-                    text_dict["from_name"] = pred_type
+                    text_dict["from_name"] = 'verbatim_transcribed_json'
                 id = f"shoonya_{idx}s{generate_random_string(13 - len(str(idx)))}"
                 label_dict["id"] = id
                 text_dict["id"] = id

From 068a73c3063980180fbd211f132b68b4dcf38c6c Mon Sep 17 00:00:00 2001
From: Tahir <68889901+tahirjmakhdoomi@users.noreply.github.com>
Date: Sat, 7 Dec 2024 11:35:39 +0530
Subject: [PATCH 12/17] Use prediction_json instead of pred_json in views.py

---
 backend/projects/views.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/backend/projects/views.py b/backend/projects/views.py
index c2816c7d7..59220de75 100644
--- a/backend/projects/views.py
+++ b/backend/projects/views.py
@@ -879,8 +879,8 @@ def convert_prediction_json_to_annotation_result(pk, proj_type):
         if prediction_json == None:
             return result
         # for pred_type, pred_json in prediction_json.items():
-        if 'acoustic_normalised_transcribed_json' in pred_json.keys():
-            for idx, val, val_acoustic in enumerate(zip(pred_json['verbatim_transcribed_json'],pred_json['acoustic_normalised_transcribed_json'])):
+        if 'acoustic_normalised_transcribed_json' in prediction_json.keys():
+            for idx, val, val_acoustic in enumerate(zip(prediction_json['verbatim_transcribed_json'],prediction_json['acoustic_normalised_transcribed_json'])):
                 label_dict = {
                     "origin": "manual",
                     "to_name": "audio_url",
@@ -942,7 +942,7 @@ def convert_prediction_json_to_annotation_result(pk, proj_type):
                 result.append(text_dict)
                 result.append(text_dict_acoustic)
         else:
-            for idx, val in enumerate(pred_json['verbatim_transcribed_json']):
+            for idx, val in enumerate(prediction_json['verbatim_transcribed_json']):
                 label_dict = {
                     "origin": "manual",
                     "to_name": "audio_url",

From 2182c7aaff33d0c6c5fbf4db1c0bf32776757acf Mon Sep 17 00:00:00 2001
From: Tahir <68889901+tahirjmakhdoomi@users.noreply.github.com>
Date: Sat, 7 Dec 2024 11:38:56 +0530
Subject: [PATCH 13/17] Fix tuple unpacking in enumerate(zip(...)) loop in views.py

---
 backend/projects/views.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/projects/views.py b/backend/projects/views.py
index 59220de75..1f6e97e4e 100644
--- a/backend/projects/views.py
+++ b/backend/projects/views.py
@@ -880,7 +880,7 @@ def convert_prediction_json_to_annotation_result(pk, proj_type):
             return result
         # for pred_type, pred_json in prediction_json.items():
         if 'acoustic_normalised_transcribed_json' in prediction_json.keys():
-            for idx, val, val_acoustic in enumerate(zip(prediction_json['verbatim_transcribed_json'],prediction_json['acoustic_normalised_transcribed_json'])):
+            for idx, (val, val_acoustic) in enumerate(zip(prediction_json['verbatim_transcribed_json'],prediction_json['acoustic_normalised_transcribed_json'])):
                 label_dict = {
                     "origin": "manual",
                     "to_name": "audio_url",

From 8855826216af94f7a4deb21c776e1322b689a649 Mon Sep 17 00:00:00 2001
From: Kunal Tiwary <kunaltiwary7@gmail.com>
Date: Sun, 8 Dec 2024 07:24:44 +0000
Subject: [PATCH 14/17] added fix for draft_data_json

---
 backend/projects/annotation_registry.py | 85 +++++--------------------
 backend/projects/views.py               | 63 ++++++++++--------
 2 files changed, 54 insertions(+), 94 deletions(-)

diff --git a/backend/projects/annotation_registry.py b/backend/projects/annotation_registry.py
index 002896007..0e29a5b40 100644
--- a/backend/projects/annotation_registry.py
+++ b/backend/projects/annotation_registry.py
@@ -167,59 +167,6 @@
 }
 
 
-def convert_prediction_json_to_annotation_result(
-    prediction_json, speakers_json, audio_duration, index, tred_type, is_acoustic=False
-):
-    """
-    Convert prediction_json and transcribed_json to annotation_result
-    """
-
-    result = []
-    if prediction_json == None:
-        return result
-
-    for idx, val in enumerate(prediction_json):
-        label_dict = {
-            "origin": "manual",
-            "to_name": "audio_url",
-            "from_name": "labels",
-            "original_length": audio_duration,
-        }
-        text_dict = {
-            "origin": "manual",
-            "to_name": "audio_url",
-            "from_name": "transcribed_json",
-            "original_length": audio_duration,
-        }
-        if is_acoustic:
-            text_dict["from_name"] = tred_type
-        id = f"shoonya_{index}s{idx}s{generate_random_string(13-len(str(idx)))}"
-        label_dict["id"] = id
-        text_dict["id"] = id
-        label_dict["type"] = "labels"
-        text_dict["type"] = "textarea"
-
-        value_labels = {
-            "start": val["start"],
-            "end": val["end"],
-            "labels": [
-                next(
-                    speaker
-                    for speaker in speakers_json
-                    if speaker["speaker_id"] == val["speaker_id"]
-                )["name"]
-            ],
-        }
-        value_text = {"start": val["start"], "end": val["end"], "text": [val["text"]]}
-
-        label_dict["value"] = value_labels
-        text_dict["value"] = value_text
-        result.append(label_dict)
-        result.append(text_dict)
-
-    return result
-
-
 def convert_conversation_json_to_annotation_result(conversation_json, idx):
     result = []
     for i in range(len(conversation_json)):
@@ -239,12 +186,15 @@ def convert_conversation_json_to_annotation_result(conversation_json, idx):
 
 
 def draft_data_json_to_annotation_result(draft_data_json, project_type, pk=None):
+    from projects.views import convert_prediction_json_to_annotation_result
+
     registry_helper = ProjectRegistry.get_instance()
     input_dataset_info = registry_helper.get_input_dataset_and_fields(project_type)
     dataset_model = getattr(dataset_models, input_dataset_info["dataset_type"])
     try:
         dataset_item = dataset_model.objects.get(pk=pk)
     except:
+        dataset_item = None
         pass
     result = []
     idx = 0
@@ -263,21 +213,20 @@ def draft_data_json_to_annotation_result(draft_data_json, project_type, pk=None)
             if field == "conversation_json":
                 ans = convert_conversation_json_to_annotation_result(value, idx)
             elif field == "transcribed_json" or field == "prediction_json":
-                assert type(value) in [list, dict], f"Something wrong is there in the type of {value}"
-                if isinstance(value,list):
-                    value = {
-                        "verbatim_transcribed_json": value
-                    }
-                for tred_type, tred_value in value.items():
-                    sub_ans = convert_prediction_json_to_annotation_result(
-                        tred_value,
-                        dataset_item.speakers_json,
-                        dataset_item.audio_duration,
-                        idx,
-                        tred_type = tred_type,
-                        is_acoustic = (project_type == "AcousticNormalisedTranscriptionEditing")
-                    )
-                    ans.extend(sub_ans)
+                assert type(value) in [
+                    list,
+                    dict,
+                ], f"Something wrong is there in the type of {value}"
+                if isinstance(value, list):
+                    value = {"verbatim_transcribed_json": value}
+                sub_ans = convert_prediction_json_to_annotation_result(
+                    None,
+                    project_type,
+                    dataset_item,
+                    value,
+                    True,
+                )
+                ans.extend(sub_ans)
             else:
                 if field_type == "textarea":
                     field_dict["value"] = {"text": [value]}
diff --git a/backend/projects/views.py b/backend/projects/views.py
index 1f6e97e4e..039e53570 100644
--- a/backend/projects/views.py
+++ b/backend/projects/views.py
@@ -854,33 +854,42 @@ def get_task_count_unassigned(pk, user):
     return len(proj_tasks_unassigned)
 
 
-def convert_prediction_json_to_annotation_result(pk, proj_type):
+def convert_prediction_json_to_annotation_result(
+    pk, proj_type, data_item, prediction_json, populate_draft_data=False
+):
     result = []
     if (
         proj_type == "AudioTranscriptionEditing"
         or proj_type == "AcousticNormalisedTranscriptionEditing"
     ):
-        data_item = SpeechConversation.objects.get(pk=pk)
-        prediction_json = (
-            json.loads(data_item.prediction_json)
-            if isinstance(data_item.prediction_json, str)
-            else data_item.prediction_json
-        )
-        assert type(prediction_json) in [dict, list], "Seems something is wrong with the formatting"
+        if not data_item and not prediction_json:
+            data_item = SpeechConversation.objects.get(pk=pk)
+            prediction_json = (
+                json.loads(data_item.prediction_json)
+                if isinstance(data_item.prediction_json, str)
+                else data_item.prediction_json
+            )
+        assert type(prediction_json) in [
+            dict,
+            list,
+        ], "Seems something is wrong with the formatting"
         # see if the prediction is a list, then it seems that only verbatim json is present
-        if isinstance(prediction_json,list):
-            prediction_json = {
-                    "verbatim_transcribed_json": prediction_json 
-            }
-        
+        if isinstance(prediction_json, list):
+            prediction_json = {"verbatim_transcribed_json": prediction_json}
+
         speakers_json = data_item.speakers_json
         audio_duration = data_item.audio_duration
         # converting prediction_json to result (wherever it exists) for every task.
         if prediction_json == None:
             return result
         # for pred_type, pred_json in prediction_json.items():
-        if 'acoustic_normalised_transcribed_json' in prediction_json.keys():
-            for idx, (val, val_acoustic) in enumerate(zip(prediction_json['verbatim_transcribed_json'],prediction_json['acoustic_normalised_transcribed_json'])):
+        if "acoustic_normalised_transcribed_json" in prediction_json.keys():
+            for idx, (val, val_acoustic) in enumerate(
+                zip(
+                    prediction_json["verbatim_transcribed_json"],
+                    prediction_json["acoustic_normalised_transcribed_json"],
+                )
+            ):
                 label_dict = {
                     "origin": "manual",
                     "to_name": "audio_url",
@@ -900,18 +909,20 @@ def convert_prediction_json_to_annotation_result(pk, proj_type):
                     "original_length": audio_duration,
                 }
                 if proj_type == "AcousticNormalisedTranscriptionEditing":
-                    text_dict["from_name"] = 'verbatim_transcribed_json'
-                    text_dict_acoustic["from_name"] = 'acoustic_normalised_transcribed_json'
-                    
+                    text_dict["from_name"] = "verbatim_transcribed_json"
+                    text_dict_acoustic[
+                        "from_name"
+                    ] = "acoustic_normalised_transcribed_json"
+
                 id = f"shoonya_{idx}s{generate_random_string(13 - len(str(idx)))}"
                 label_dict["id"] = id
                 text_dict["id"] = id
                 text_dict_acoustic["id"] = id
-                
+
                 label_dict["type"] = "labels"
                 text_dict["type"] = "textarea"
                 text_dict_acoustic["type"] = "textarea"
-    
+
                 value_labels = {
                     "start": val["start"],
                     "end": val["end"],
@@ -933,7 +944,7 @@ def convert_prediction_json_to_annotation_result(pk, proj_type):
                     "end": val_acoustic["end"],
                     "text": [val_acoustic["text"]],
                 }
-    
+
                 label_dict["value"] = value_labels
                 text_dict["value"] = value_text
                 text_dict_acoustic["value"] = value_text_acoustic
@@ -942,7 +953,7 @@ def convert_prediction_json_to_annotation_result(pk, proj_type):
                 result.append(text_dict)
                 result.append(text_dict_acoustic)
         else:
-            for idx, val in enumerate(prediction_json['verbatim_transcribed_json']):
+            for idx, val in enumerate(prediction_json["verbatim_transcribed_json"]):
                 label_dict = {
                     "origin": "manual",
                     "to_name": "audio_url",
@@ -956,13 +967,13 @@ def convert_prediction_json_to_annotation_result(pk, proj_type):
                     "original_length": audio_duration,
                 }
                 if proj_type == "AcousticNormalisedTranscriptionEditing":
-                    text_dict["from_name"] = 'verbatim_transcribed_json'
+                    text_dict["from_name"] = "verbatim_transcribed_json"
                 id = f"shoonya_{idx}s{generate_random_string(13 - len(str(idx)))}"
                 label_dict["id"] = id
                 text_dict["id"] = id
                 label_dict["type"] = "labels"
                 text_dict["type"] = "textarea"
-    
+
                 value_labels = {
                     "start": val["start"],
                     "end": val["end"],
@@ -979,7 +990,7 @@ def convert_prediction_json_to_annotation_result(pk, proj_type):
                     "end": val["end"],
                     "text": [val["text"]],
                 }
-    
+
                 label_dict["value"] = value_labels
                 text_dict["value"] = value_text
                 # mainly label_dict and text_dict are sent as result
@@ -2439,7 +2450,7 @@ def assign_new_tasks(self, request, pk, *args, **kwargs):
             ]:
                 try:
                     result = convert_prediction_json_to_annotation_result(
-                        task.input_data.id, project.project_type
+                        task.input_data.id, project.project_type, None, None, False
                     )
                 except Exception as e:
                     print(

From 31f79a87e02a67e3a05b670a6b3e26bfc74269e8 Mon Sep 17 00:00:00 2001
From: Kunal Tiwary <kunaltiwary7@gmail.com>
Date: Mon, 9 Dec 2024 06:41:39 +0000
Subject: [PATCH 15/17] added fetch_parent_data_field option to project download

---
 backend/projects/utils.py | 18 +++++++++++++++++-
 backend/projects/views.py |  5 ++++-
 2 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/backend/projects/utils.py b/backend/projects/utils.py
index 4987ed878..bb539a401 100644
--- a/backend/projects/utils.py
+++ b/backend/projects/utils.py
@@ -27,7 +27,7 @@
 from jiwer import wer
 
 from utils.convert_result_to_chitralekha_format import create_memory
-
+from dataset import models as dataset_models
 
 nltk.download("punkt")
 
@@ -485,6 +485,7 @@ def process_task(
     include_input_data_metadata_json,
     dataset_model,
     is_audio_project_type,
+    fetch_parent_data_field,
 ):
     task_dict = model_to_dict(task)
     if export_type != "JSON":
@@ -519,6 +520,21 @@ def process_task(
         task_dict["data"]["input_data_metadata_json"] = dataset_model.objects.get(
             pk=task_dict["input_data"]
         ).metadata_json
+    try:
+        if fetch_parent_data_field and dataset_model:
+            parent_data_item = dataset_model.objects.get(
+                pk=task_dict["input_data"]
+            ).parent_data
+            if parent_data_item:
+                dataset_model = getattr(
+                    dataset_models, parent_data_item.instance_id.dataset_type
+                )
+                parent_dataset_model = dataset_model.objects.get(pk=parent_data_item.id)
+                task_dict["data"]["fetch_parent_data_field"] = getattr(
+                    parent_dataset_model, fetch_parent_data_field, None
+                )
+    except Exception as e:
+        pass
 
     del task_dict["annotation_users"]
     del task_dict["review_user"]
diff --git a/backend/projects/views.py b/backend/projects/views.py
index 039e53570..10a75b1bf 100644
--- a/backend/projects/views.py
+++ b/backend/projects/views.py
@@ -4130,7 +4130,9 @@ def download(self, request, pk=None, *args, **kwargs):
         try:
             project = Project.objects.get(pk=pk)
             project_type = dict(PROJECT_TYPE_CHOICES)[project.project_type]
-
+            fetch_parent_data_field = request.query_params.get(
+                "fetch_parent_data_field", None
+            )
             include_input_data_metadata_json = request.query_params.get(
                 "include_input_data_metadata_json", False
             )
@@ -4181,6 +4183,7 @@ def download(self, request, pk=None, *args, **kwargs):
                         include_input_data_metadata_json,
                         dataset_model,
                         is_audio_project_type,
+                        fetch_parent_data_field,
                     )
                     if (
                         is_ConversationTranslation

From 8d9fd12ce9f35776541bc5482c2001c00ffb5d6d Mon Sep 17 00:00:00 2001
From: Kunal Tiwary <kunaltiwary7@gmail.com>
Date: Thu, 12 Dec 2024 06:42:03 +0000
Subject: [PATCH 16/17] added OCRTextlineSegmentation handling in OCR export, reports and task update

---
 backend/projects/utils.py | 17 ++++++++++++++---
 backend/projects/views.py | 14 +++++++++++---
 backend/tasks/views.py    | 10 +++++++---
 3 files changed, 32 insertions(+), 9 deletions(-)

diff --git a/backend/projects/utils.py b/backend/projects/utils.py
index 4987ed878..2b6e091ed 100644
--- a/backend/projects/utils.py
+++ b/backend/projects/utils.py
@@ -361,7 +361,10 @@ def process_speech_tasks(task, is_audio_segmentation, project_type):
 
 
 def process_ocr_tasks(
-    task, is_OCRSegmentCategorization, is_OCRSegmentCategorizationEditing
+    task,
+    is_OCRSegmentCategorization,
+    is_OCRSegmentCategorizationEditing,
+    is_OCRTextlineSegmentation,
 ):
     annotation_result = process_annotation_result(task)
     process_ocr_results(
@@ -369,6 +372,7 @@ def process_ocr_tasks(
         annotation_result,
         is_OCRSegmentCategorization,
         is_OCRSegmentCategorizationEditing,
+        is_OCRTextlineSegmentation,
     )
 
 
@@ -451,6 +455,7 @@ def process_ocr_results(
     annotation_result,
     is_OCRSegmentCategorization,
     is_OCRSegmentCategorizationEditing,
+    is_OCRTextlineSegmentation,
 ):
     from projects.views import convert_annotation_result_to_formatted_json
 
@@ -458,10 +463,16 @@ def process_ocr_results(
         annotation_result,
         None,
         False,
-        is_OCRSegmentCategorization or is_OCRSegmentCategorizationEditing,
+        is_OCRSegmentCategorization
+        or is_OCRSegmentCategorizationEditing
+        or is_OCRTextlineSegmentation,
         False,
     )
-    if is_OCRSegmentCategorization or is_OCRSegmentCategorizationEditing:
+    if (
+        is_OCRSegmentCategorization
+        or is_OCRSegmentCategorizationEditing
+        or is_OCRTextlineSegmentation
+    ):
         bboxes_relation_json = []
         for ann in annotation_result:
             if "type" in ann and ann["type"] == "relation":
diff --git a/backend/projects/views.py b/backend/projects/views.py
index cb48392f9..4c36887c8 100644
--- a/backend/projects/views.py
+++ b/backend/projects/views.py
@@ -402,6 +402,7 @@ def get_review_reports(proj_id, userid, start_date, end_date):
             "OCRTranscription",
             "OCRSegmentCategorization",
             "OCRSegmentCategorizationEditing",
+            "OCRTextlineSegmentation",
         ]:
             result["Total Word Count"] = total_word_count
         elif proj_type in get_audio_project_types():
@@ -650,6 +651,7 @@ def get_supercheck_reports(proj_id, userid, start_date, end_date):
         "OCRTranscription",
         "OCRSegmentCategorization",
         "OCRSegmentCategorizationEditing",
+        "OCRTextlineSegmentation",
     ]:
         result["Validated Word Count"] = validated_word_count
         result["Validated With Changes Word Count"] = validated_with_changes_word_count
@@ -994,7 +996,7 @@ def convert_annotation_result_to_formatted_json(
     annotation_result,
     speakers_json,
     is_SpeechConversation,
-    is_OCRSegmentCategorizationOROCRSegmentCategorizationEditing,
+    is_OCRSegmentCategorizationOROCRSegmentCategorizationEditingOROCRTextlineSegmentation,
     is_acoustic=False,
 ):
     transcribed_json = []
@@ -1090,14 +1092,18 @@ def convert_annotation_result_to_formatted_json(
                 acoustic_transcribed_json, ensure_ascii=False
             )
     else:
-        dicts = 2 if is_OCRSegmentCategorizationOROCRSegmentCategorizationEditing else 3
+        dicts = (
+            2
+            if is_OCRSegmentCategorizationOROCRSegmentCategorizationEditingOROCRTextlineSegmentation
+            else 3
+        )
         for idx1 in range(0, len(annotation_result), dicts):
             rectangle_dict = {}
             labels_dict = {}
             text_dict = {}
             if isinstance(annotation_result[idx1], str):
                 annotation_result[idx1] = json.loads(annotation_result[idx1])
-            if is_OCRSegmentCategorizationOROCRSegmentCategorizationEditing:
+            if is_OCRSegmentCategorizationOROCRSegmentCategorizationEditingOROCRTextlineSegmentation:
                 custom_text_dict = {"value": {"text": ""}}
                 text_dict = json.dumps(custom_text_dict, indent=2)
             for idx2 in range(idx1, idx1 + dicts):
@@ -4092,6 +4098,7 @@ def download(self, request, pk=None, *args, **kwargs):
             is_OCRSegmentCategorizationEditing = (
                 project_type == "OCRSegmentCategorizationEditing"
             )
+            is_OCRTextlineSegmentation = project_type == "OCRTextlineSegmentation"
             is_OCRSegmentCategorization = project_type == "OCRSegmentCategorization"
             for task in tasks:
                 try:
@@ -4123,6 +4130,7 @@ def download(self, request, pk=None, *args, **kwargs):
                                 curr_task,
                                 is_OCRSegmentCategorization,
                                 is_OCRSegmentCategorizationEditing,
+                                is_OCRTextlineSegmentation,
                             )
                 except Exception as e:
                     continue
diff --git a/backend/tasks/views.py b/backend/tasks/views.py
index 200fc1250..320405aa4 100644
--- a/backend/tasks/views.py
+++ b/backend/tasks/views.py
@@ -1754,13 +1754,17 @@ def partial_update(self, request, pk=None):
             == "AcousticNormalisedTranscriptionEditing"
             else False
         )
-        is_ocr_sc_or_sce = (
+        is_ocr_sc_or_sce_or_ts = (
             True
             if annotation_obj.task.project_id.project_type
-            in ["OCRSegmentCategorization", "OCRSegmentCategorizationEditing"]
+            in [
+                "OCRSegmentCategorization",
+                "OCRSegmentCategorizationEditing",
+                "OCRTextlineSegmentation",
+            ]
             else False
         )
-        if is_ocr_sc_or_sce and (
+        if is_ocr_sc_or_sce_or_ts_or_ts and (
             "language" in request.data or "ocr_domain" in request.data
         ):
             language = request.data.get("languages", [])

From dc78e7667bb55a4785baafa2db193de51b72e65f Mon Sep 17 00:00:00 2001
From: Kunal Tiwary <kunaltiwary7@gmail.com>
Date: Fri, 13 Dec 2024 07:57:38 +0000
Subject: [PATCH 17/17] fixed undefined name is_ocr_sc_or_sce_or_ts_or_ts in partial_update

---
 backend/tasks/views.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/tasks/views.py b/backend/tasks/views.py
index 320405aa4..2ec9107bf 100644
--- a/backend/tasks/views.py
+++ b/backend/tasks/views.py
@@ -1764,7 +1764,7 @@ def partial_update(self, request, pk=None):
             ]
             else False
         )
-        if is_ocr_sc_or_sce_or_ts_or_ts and (
+        if is_ocr_sc_or_sce_or_ts and (
             "language" in request.data or "ocr_domain" in request.data
         ):
             language = request.data.get("languages", [])