Dataset:
gorkemgoknar/tr_ted_talk_translated
The processing steps are included below in case you want to build the same dataset for another language.
```python
# Using Turkish as the target language
target_lang = "tr"  # change to your target language

import re

from datasets import load_dataset
from huggingface_hub import notebook_login

# ted_multi is a multi-language translated dataset: a good fit for our case,
# not too big and curated, but it needs some simple processing
dataset = load_dataset("ted_multi")
dataset.cleanup_cache_files()

# Original from Patrick's notebook:
# chars_to_ignore_regex = '[,?.!\-\;\:\"“%‘”�—’…–]'
# Change this to the characters ignored by your fine-tuned model.
# We will use cahya/wav2vec2-base-turkish-artificial-cv; check inside the
# model repository to find which characters were removed (there is no run.sh).
chars_to_ignore_regex = r'[\,\?\.\!\-\;\:\"\“\‘\”\'\`…\’»«]'

def extract_target_lang_entries(batch):
    # Mapping specific to the ted_multi dataset: we need to find the index of
    # the target language in each example's translations, as it can shift
    try:
        target_index_for_lang = batch["translations"]["language"].index(target_lang)
    except ValueError:
        # Target language not in the list; set text to None for later filtering
        batch["text"] = None
        return batch

    text = batch["translations"]["translation"][target_index_for_lang]
    batch["text"] = re.sub(chars_to_ignore_regex, "", text.lower())
    return batch

# This dataset has additional columns that we need to drop
cols_to_remove = ["translations", "talk_name"]
dataset = dataset.map(extract_target_lang_entries, remove_columns=cols_to_remove)

# During processing we tagged the empty entries with None; filter them out
dataset_cleaned = dataset.filter(lambda x: x["text"] is not None)
dataset_cleaned  # inspect the cleaned dataset (displays in a notebook)

notebook_login()

dataset_cleaned.push_to_hub(f"{target_lang}_ted_talk_translated")
```
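Once pushed, the processed dataset can be loaded back like any other Hub dataset. A minimal sketch, assuming the repo lives under the `gorkemgoknar` account as above (adjust the path to your own username if you pushed your own version):

```python
from datasets import load_dataset

# Load the processed dataset back from the Hub
dataset = load_dataset("gorkemgoknar/tr_ted_talk_translated")

# Each example carries a single cleaned "text" field
print(dataset["train"][0]["text"])
```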