-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprocess_data.py
80 lines (67 loc) · 1.97 KB
/
process_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from torch.utils.data import DataLoader, Dataset
# <<AI_ASSISTED_CODE_START>>
# Lookup table: English number word -> numeric value of that word.
vocab = {
    # Units.
    'zero': 0, 'one': 1, 'two': 2, 'three': 3, 'four': 4,
    'five': 5, 'six': 6, 'seven': 7, 'eight': 8, 'nine': 9,
    # Ten through nineteen.
    'ten': 10, 'eleven': 11, 'twelve': 12, 'thirteen': 13, 'fourteen': 14,
    'fifteen': 15, 'sixteen': 16, 'seventeen': 17, 'eighteen': 18,
    'nineteen': 19,
    # Multiples of ten.
    'twenty': 20, 'thirty': 30, 'forty': 40, 'fifty': 50,
    'sixty': 60, 'seventy': 70, 'eighty': 80, 'ninety': 90,
    # Scale words.
    'hundred': 100, 'thousand': 1000,
    # Connector word: kept in the table with value 0 instead of being dropped.
    'and': 0,
}
# Load the raw dataset (path is relative to the working directory).
# Later steps in this script read its 'text' and 'number' columns.
df = pd.read_csv('./datasets/all_data.csv')
# Function to map words to their numerical values
def text_to_numbers(text, vocabulary=None):
    """Convert a spelled-out number into the list of its token values.

    The text is lower-cased and split on whitespace, hyphens and commas, so
    "Twenty-One" and "one thousand, two hundred" tokenize the same way as
    their lower-case, space-separated forms.

    Args:
        text: String of number words, e.g. "one hundred and five".
        vocabulary: Optional mapping from word to numeric value. When
            omitted, the module-level ``vocab`` table is used.

    Returns:
        A list with one number per token, in order of appearance. Tokens
        that are not in the vocabulary are encoded as 0.
    """
    if vocabulary is None:
        vocabulary = vocab
    # Hyphens and commas separate words in written numbers; treating them as
    # part of a token would turn "twenty-one" into a single unknown word.
    tokens = text.lower().replace('-', ' ').replace(',', ' ').split()
    # Unexpected tokens keep their position but contribute 0.
    return [vocabulary.get(token, 0) for token in tokens]
# Encode each text row as a list of token values.
df['input_seq'] = df['text'].map(text_to_numbers)
# Regression target: the number itself, stored as a float.
df['target'] = df['number'].astype(float)
# Print the first ten processed rows as a quick sanity check.
print(df.head(10)[['text', 'input_seq', 'target']])
# <<AI_ASSISTED_CODE_END>>
# 1) Normalise input_seq to lists of ints and find the longest sequence so
#    that every sequence can later be padded to the same length.
def _as_int_list(seq):
    """Return *seq* as a list of ints.

    Accepts an actual sequence of numbers (what text_to_numbers produces
    earlier in this script) as well as its bracketed string form such as
    "[1, 100, 5]" (how such a column looks when read back from a CSV file).
    Calling str.strip on a real list raised AttributeError, so the two cases
    are handled separately.
    """
    if isinstance(seq, str):
        # Skip blank pieces so that "[]" parses to an empty list.
        return [int(part) for part in seq.strip('[]').split(',') if part.strip()]
    return [int(value) for value in seq]


df['input_seq'] = df['input_seq'].apply(_as_int_list)
max_len = df['input_seq'].apply(len).max()
def pad_the_sequence(seq, max_len, pad_value=0):
    """Return a new list holding *seq* right-padded to *max_len* items.

    Args:
        seq: Values to pad; a list, tuple or any other iterable.
        max_len: Target length of the result.
        pad_value: Filler appended after the original items. Defaults to 0.

    Returns:
        A new list. Input that already has *max_len* or more items comes
        back as an unpadded copy; nothing is truncated.
    """
    items = list(seq)
    # A non-positive repeat count yields [], so long inputs pass through.
    return items + [pad_value] * (max_len - len(items))
# Add a column holding each sequence padded out to max_len items.
df['input_seq_padded'] = df['input_seq'].apply(pad_the_sequence, args=(max_len,))
# Write the processed frame to ./datasets and to the working directory.
for output_path in ('./datasets/processed_data.csv', 'processed_data.csv'):
    df.to_csv(output_path, index=False)