main.py
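# Build train/test JSONL datasets from Azure ML labeling exports, mapping each
# labeled row back to its sanitized source text.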
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from coolname import generate_slug
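# Load the sanitized source texts; labeled rows reference these by row index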
#texts = pd.read_json("input/data_for_nlp_sanitized.json", lines=True)
texts = pd.read_excel("input/data_for_nlp_sanitized.xlsx")
# Collect every labeled export (.jsonl) under ./files into a single DataFrame
df = pd.DataFrame()
for file in os.listdir("./files"):
    if file.endswith(".jsonl"):
        df = pd.concat([df, pd.read_json("./files/" + file, lines=True)])
# Strip the Azure ML labeling path prefix and the ".txt" suffix from image_url
# so only the row index remains, then look up the corresponding full text
df['image_url'] = df['image_url'].str.replace('AmlDatastore://workspaceblobstore/Labeling/outputs/doNotDelete/tabularDataset/conversion/UX/d01d73bf-f487-d5e9-62a4-74a3a428011a/jsonlines_row_', '', regex=False)
df['image_url'] = df['image_url'].str.replace('.txt', '', regex=False)
df["image_url"] = df["image_url"].apply(lambda x: texts["full_text"][int(x)])
test_size = 0.2
train, test = train_test_split(df, test_size=test_size, random_state=42)
dataset_name = generate_slug(2)
# Create an output folder named after the randomly generated dataset slug
os.mkdir(f"output/{dataset_name}")
df.to_json(f"output/{dataset_name}/full-{len(df)}-labeled-{dataset_name}.jsonl", orient="records", lines=True, force_ascii=False)
train.to_json(f"output/{dataset_name}/train-%{int((1 - test_size) * 100)}-{dataset_name}.jsonl", orient="records", lines=True, force_ascii=False)
test.to_json(f"output/{dataset_name}/test-%{int(test_size * 100)}-{dataset_name}.jsonl", orient="records", lines=True, force_ascii=False)