import torch
from torch.utils.data import Dataset, DataLoader
from transformers import MarianMTModel, MarianTokenizer
# Define a custom dataset of tab-separated (source, target) sentence pairs
class CustomDataset(Dataset):
    def __init__(self, file_path):
        self.data = []
        # Read one "source<TAB>target" pair per line
        with open(file_path, "r", encoding="utf-8") as file:
            for line in file:
                source, target = line.strip().split("\t")
                self.data.append((source, target))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        source_sentence, target_sentence = self.data[idx]
        return source_sentence, target_sentence
# Define your dataset
dataset = CustomDataset("modified_hin.txt")
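
# Optional sanity check: this assumes modified_hin.txt holds one
# tab-separated "english<TAB>hindi" pair per line, as parsed above.
print(f"Loaded {len(dataset)} sentence pairs")
print(f"First pair: {dataset[0]}")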

# Define the translation model and its matching tokenizer
# (Marian expects subword token ids from its own tokenizer)
model_name = "Helsinki-NLP/opus-mt-en-hi"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Define data loader
batch_size = 16
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Define optimizer; the model computes the cross-entropy loss itself
# when labels are passed, so no separate loss function is needed
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Define number of epochs
num_epochs = 3

# Train loop
model.train()
for epoch in range(num_epochs):
    for source_batch, target_batch in data_loader:
        # Tokenize and pad the batch with the Marian tokenizer
        source_inputs = tokenizer(
            list(source_batch), return_tensors="pt", padding=True, truncation=True
        )
        target_inputs = tokenizer(
            text_target=list(target_batch), return_tensors="pt", padding=True, truncation=True
        )

        # Replace padding ids in the labels with -100 so the loss ignores them
        labels = target_inputs["input_ids"].clone()
        labels[labels == tokenizer.pad_token_id] = -100

        # Forward pass; passing labels makes the model shift the decoder
        # inputs internally and compute the cross-entropy loss
        outputs = model(
            input_ids=source_inputs["input_ids"],
            attention_mask=source_inputs["attention_mask"],
            labels=labels,
        )
        loss = outputs.loss

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Print loss at the end of each epoch
    print(f"Epoch {epoch + 1}, Loss: {loss.item()}")

# Translation function
def translate_sentence(sentence):
    # Tokenize the input sentence
    input_ids = tokenizer(sentence, return_tensors="pt").input_ids

    # Generate translation
    model.eval()
    with torch.no_grad():
        translation_ids = model.generate(
            input_ids=input_ids, max_length=100, num_beams=4, early_stopping=True
        )

    # Decode the generated token ids back into text
    translated_sentence = tokenizer.decode(translation_ids[0], skip_special_tokens=True)
    return translated_sentence

# Example usage
english_sentence = "Hello!"
translated_sentence = translate_sentence(english_sentence)
print(f"English: {english_sentence}")
print(f"Hindi: {translated_sentence}")