模型:
julien-c/dummy-unknown
# Build script for a tiny dummy RoBERTa checkpoint plus matching tokenizer
# files. It writes PyTorch and TensorFlow weights, `vocab.json` and
# `merges.txt` into DIRNAME.

import json
import os

from transformers import RobertaConfig, RobertaForMaskedLM, TFRobertaForMaskedLM

# Output directory shared by the model weights and the tokenizer files.
DIRNAME = "./dummy-unknown"

# Deliberately tiny configuration so the checkpoint stays small.
# Presumably the positional arguments are (vocab_size, hidden_size,
# num_hidden_layers, num_attention_heads, intermediate_size) -- confirm
# against the RobertaConfig signature of the installed transformers version.
# NOTE(review): the tokenizer vocab below has 20 entries while the first
# argument here is 10; if that argument is the vocab size the two disagree.
# Confirm whether this is intentional for the dummy fixture.
config = RobertaConfig(10, 20, 1, 1, 40)

# Instantiate the model from the config alone (no pretrained weights are
# loaded) and save it in PyTorch format.
model = RobertaForMaskedLM(config)
model.save_pretrained(DIRNAME)

# Reload the just-saved PyTorch checkpoint as a TensorFlow model and save it
# next to the PyTorch weights, so the directory holds both formats.
tf_model = TFRobertaForMaskedLM.from_pretrained(DIRNAME, from_pt=True)
tf_model.save_pretrained(DIRNAME)

# Tokenizer:
# "\u0120" is the character "Ġ"; several tokens below start with it.
vocab = [
    "l",
    "o",
    "w",
    "e",
    "r",
    "s",
    "t",
    "i",
    "d",
    "n",
    "\u0120",
    "\u0120l",
    "\u0120n",
    "\u0120lo",
    "\u0120low",
    "er",
    "\u0120lowest",
    "\u0120newer",
    "\u0120wider",
    "<unk>",
]
# Map each token to its position in the list (token -> id).
vocab_tokens = {token: index for index, token in enumerate(vocab)}

# Merge rules, one per line once joined. The trailing empty string makes the
# joined text end with a newline.
merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]

vocab_file = os.path.join(DIRNAME, "vocab.json")
merges_file = os.path.join(DIRNAME, "merges.txt")

with open(vocab_file, "w", encoding="utf-8") as fp:
    fp.write(json.dumps(vocab_tokens) + "\n")

with open(merges_file, "w", encoding="utf-8") as fp:
    fp.write("\n".join(merges))