import torch
from torch.utils.data import Dataset, DataLoader
from transformers import MarianMTModel, MarianTokenizer
# Define a custom dataset of tab-separated (source, target) sentence pairs
class CustomDataset(Dataset):
    def __init__(self, file_path):
        self.data = []
        # Read one "source<TAB>target" pair per line
        with open(file_path, "r", encoding="utf-8") as file:
            for line in file:
                source, target = line.strip().split("\t")
                self.data.append((source, target))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        source_sentence, target_sentence = self.data[idx]
        return source_sentence, target_sentence
# Define your dataset
dataset = CustomDataset("modified_hin.txt")
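
# Optional sanity check: this assumes modified_hin.txt holds one
# tab-separated "english<TAB>hindi" pair per line, as parsed above.
print(f"Loaded {len(dataset)} sentence pairs")
print(f"First pair: {dataset[0]}")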

# Define the translation model and its matching tokenizer
# (Marian expects subword token ids from its own tokenizer)
model_name = "Helsinki-NLP/opus-mt-en-hi"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Define data loader
batch_size = 16
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Define optimizer; the model computes the cross-entropy loss itself
# when labels are passed, so no separate loss function is needed
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Define number of epochs
num_epochs = 3

# Train loop
model.train()
for epoch in range(num_epochs):
    for source_batch, target_batch in data_loader:
        # Tokenize and pad the batch with the Marian tokenizer
        source_inputs = tokenizer(
            list(source_batch), return_tensors="pt", padding=True, truncation=True
        )
        target_inputs = tokenizer(
            text_target=list(target_batch), return_tensors="pt", padding=True, truncation=True
        )

        # Replace padding ids in the labels with -100 so the loss ignores them
        labels = target_inputs["input_ids"].clone()
        labels[labels == tokenizer.pad_token_id] = -100

        # Forward pass; passing labels makes the model shift the decoder
        # inputs internally and compute the cross-entropy loss
        outputs = model(
            input_ids=source_inputs["input_ids"],
            attention_mask=source_inputs["attention_mask"],
            labels=labels,
        )
        loss = outputs.loss

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Print loss at the end of each epoch
    print(f"Epoch {epoch + 1}, Loss: {loss.item()}")

# Translation function
def translate_sentence(sentence):
    # Tokenize the input sentence
    input_ids = tokenizer(sentence, return_tensors="pt").input_ids

    # Generate translation
    model.eval()
    with torch.no_grad():
        translation_ids = model.generate(
            input_ids=input_ids, max_length=100, num_beams=4, early_stopping=True
        )

    # Decode the generated token ids back into text
    translated_sentence = tokenizer.decode(translation_ids[0], skip_special_tokens=True)
    return translated_sentence

# Example usage
english_sentence = "Hello!"
translated_sentence = translate_sentence(english_sentence)
print(f"English: {english_sentence}")
print(f"Hindi: {translated_sentence}")