Opus100
from datasets import load_dataset
load_dataset("tdtunlp/opus100_envi")
-
Format for Translation task
def preprocess(
sample,
instruction_key="### Instruction:",
input_key="Input:",
response_key="<|endofprompt|>",
end_key="<|endoftext|>",
en2vi=True,
):
if en2vi:
if random.random() < 0.5:
instruction = "Translate the following sentences from English into Vietnamese."
else:
instruction = "Dịch các câu sau từ tiếng Anh sang tiếng Việt."
input = sample['en'].strip()
response = sample['vi'].strip()
else:
if random.random() < 0.5:
instruction = "Translate the following sentences from Vietnamese into English."
else:
instruction = "Dịch các câu sau từ tiếng Việt sang tiếng Anh."
input = sample['vi'].strip()
response = sample['en'].strip()
return {'text': """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
{instruction_key}
{instruction}
{input_key}
{input}
{response_key}
{response}
{end_key}""".format(
instruction_key=instruction_key,
instruction=instruction,
input_key=input_key,
input=input,
response_key=response_key,
response=response,
end_key=end_key,
)}
"""
Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
Dịch các câu sau từ tiếng Anh sang tiếng Việt.
Input:
Toast falls jelly-side down, children hit tables and people get hurt.
<|endofprompt|>
Bánh mì nướng rơi đông lại, trẻ con va vào bàn và con người bị thương.
<|endoftext|>
"""