This model is `rmihaylov/bert-base-bg`, fine-tuned on a Bulgarian subset of the `wikiann` dataset.
Import the libraries:
```python
from typing import List, Dict

import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline
```
First, you'll have to define these helper functions, since the model uses a subword tokenizer:
```python
def predict(
    text: str,
    model: torch.nn.Module,
    tokenizer: AutoTokenizer,
    labels_tags={
        0: "O",
        1: "B-PER", 2: "I-PER",
        3: "B-ORG", 4: "I-ORG",
        5: "B-LOC", 6: "I-LOC"
    }
) -> List[Dict[str, str]]:
    # Tokenize the text and rebuild the whole words from the subwords.
    tokens_data = tokenizer(text)
    tokens = tokenizer.convert_ids_to_tokens(tokens_data["input_ids"])
    words = subwords_to_words(tokens)

    # Predict a label id for each token and map it to its BIO tag.
    input_ids = torch.LongTensor(tokens_data["input_ids"]).unsqueeze(0)
    attention_mask = torch.LongTensor(tokens_data["attention_mask"]).unsqueeze(0)
    out = model(input_ids, attention_mask=attention_mask).logits
    out = out.argmax(-1).squeeze(0).tolist()
    prediction = [labels_tags[idx] if idx in labels_tags else idx for idx in out]

    return merge_words_and_predictions(words, prediction)


def subwords_to_words(tokens: List[str]) -> List[str]:
    out_tokens = []
    curr_token = ""

    for token in tokens:
        if token == "[SEP]":
            curr_token = curr_token.replace("▁", "")
            out_tokens.append(curr_token)
            out_tokens.append("[SEP]")
            break

        # "▁" marks the start of a new word; flush the current word, if any.
        if "▁" in token and curr_token == "":
            curr_token += token
        elif "▁" in token and curr_token != "":
            curr_token = curr_token.replace("▁", "")
            out_tokens.append(curr_token)
            curr_token = ""
            curr_token += token
        elif "▁" not in token:
            curr_token += token

    return out_tokens


def merge_words_and_predictions(words: List[str], entities: List[str]) -> List[Dict[str, str]]:
    result = []
    curr_word = []

    for i, (word, entity) in enumerate(zip(words[1:], entities[1:])):
        # "B-" starts a new entity; flush the previous one, if any.
        if "B-" in entity:
            if curr_word:
                curr_word = " ".join(curr_word)
                result.append({
                    "word": curr_word,
                    "entity_group": entities[i][2:]
                })
                curr_word = [word]
            else:
                curr_word.append(word)

        # "I-" continues the current entity.
        if "I-" in entity:
            curr_word.append(word)

        # "O" ends the current entity; flush it, if any.
        if "O" == entity:
            if curr_word:
                curr_word = " ".join(curr_word)
                result.append({
                    "word": curr_word,
                    "entity_group": entities[i][2:]
                })
            curr_word = []

    return result
```
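To see what `subwords_to_words` does, here is a small trace on a hypothetical subword sequence (the `▁` marker denotes the start of a new word; the actual pieces depend on the model's vocabulary):

```python
tokens = ["[CLS]", "▁Ба", "рух", "▁Спин", "оза", "[SEP]"]
print(subwords_to_words(tokens))
# ['[CLS]', 'Барух', 'Спиноза', '[SEP]']
```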
Then, initialize the `AutoTokenizer` and `AutoModelForTokenClassification` objects:
```python
MODEL_ID = "auhide/bert-bg-ner"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForTokenClassification.from_pretrained(MODEL_ID)
```
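Optionally, you can inspect how the tokenizer splits a piece of text into subwords (the split shown in the comment is only indicative, since it depends on the vocabulary):

```python
ids = tokenizer("Барух Спиноза")["input_ids"]
print(tokenizer.convert_ids_to_tokens(ids))
# e.g. something like ['[CLS]', '▁Ба', 'рух', '▁Спин', 'оза', '[SEP]']
```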
Finally, call the `predict()` function defined above like this:
```python
text = "Барух Спиноза е роден в Амстердам"

print(f"Input: {text}")
print("NERs:", predict(text, model=model, tokenizer=tokenizer))
```
```
Input: Барух Спиноза е роден в Амстердам
NERs: [{'word': 'Барух Спиноза', 'entity_group': 'PER'}, {'word': 'Амстердам', 'entity_group': 'LOC'}]
```
Note: There are three entity types - `PER` (person), `ORG` (organization), and `LOC` (location). The model tags tokens in the BIO scheme (`B-` marks the beginning of an entity, `I-` its continuation, and `O` a non-entity token), and the helper functions above merge these token-level tags into entity groups.
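Alternatively, since `pipeline` is already imported, you can let the built-in token-classification pipeline do the subword aggregation. This is a minimal sketch; its aggregation strategy differs from the custom `predict()` above, so the grouping may not match exactly:

```python
ner = pipeline(
    "token-classification",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",  # merge subword tokens into entity groups
)
print(ner(text))
```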