In [31]:
import torch
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
max_len = 45

In [32]:
tag2idx = {'X': 0,
 'YM': 1,
 '[CLS]': 2,
 'DUM': 3,
 'VBF': 4,
 'RP': 5,
 'VBKO': 6,
 'CS': 7,
 'VBX': 8,
 'VBNE': 9,
 'CC': 10,
 'Unknown': 11,
 'PKO': 12,
 'JJM': 13,
 'PLE': 14,
 'VBO': 15,
 'HRU': 16,
 'YF': 17,
 'NN': 18,
 'YQ': 19,
 'VBI': 20,
 '[SEP]': 21,
 'JJ': 22,
 'POP': 23,
 'PLAI': 24,
 'RBO': 25,
 'PP': 26,
 'CD': 27,
 'NNP': 28}

# Mapping index to name
tag2name={tag2idx[key] : key for key in tag2idx.keys()}


In [33]:
tag_2_nees = {'NN': 'Noun',
'JJ': 'Normal/Unmarked Adjective', 
'NNP': 'Noun Plural',
'POP': 'Other Postpositions',
'PKO': 'Ko-Postpositions', 
'YF': 'Sentence-final Punctuation',
'CD': 'Cardinal Digits',
'PLE':'Postpositions(Le- postpositions)',
'VBF': 'Finite Verb', 
'HRU': 'Plural Marker',
'YM': 'Sentence-medial punctuation',
'VBX': 'Auxiliary Verb',
'VBKO': 'Verb aspectual participle',
'CC': 'Coordinating conjunction',
 'DUM':'Pronoun unmarked demonstrative',
 'VBNE': 'Verb(Prospective participle)',
 'VBO':'Other participle verb',
'PLAI': 'Postpositions(Lai-Postpositions)',
 'RBO': 'Adverb(Other Adverb)',
 'VBI': 'Verb Infinitive',
 'YQ': 'Quotation Marks',
 'PP':'Possessive pronoun',
 'JJM': 'Marked adjective',
 'CS': 'Subordinating conjunction appearing before/after the clause it subordinates',
 'RP': 'Particle'}

In [34]:
# ! pip install transformers


In [35]:
from transformers import BertForMaskedLM
from transformers import BertTokenizer
model = BertForMaskedLM.from_pretrained('./models/bert_out_model/en09',
 num_labels=len(tag2idx),
 output_attentions = False,
 output_hidden_states = False
 )
vocab_file_dir = './models/bert_out_model/en09' 
tokenizer = BertTokenizer.from_pretrained(vocab_file_dir,
 strip_accents=False,
 clean_text=False )

In [36]:
def Get_POS(test_query):
 tokenized_texts = []
 temp_token = []
 # Add [CLS] at the front 
 temp_token.append('[CLS]')
 token_list = tokenizer.tokenize(test_query)
 for m,token in enumerate(token_list):
 temp_token.append(token)
 # Trim the token to fit the length requirement
 if len(temp_token) > max_len-1:
 temp_token= temp_token[:max_len-1]
 # Add [SEP] at the end
 temp_token.append('[SEP]')
 tokenized_texts.append(temp_token)
 # Make text token into id
 input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
 maxlen=max_len, dtype="long", truncating="post", padding="post")
 # print(input_ids[0])
 
 # For fine tune of predict, with token mask is 1,pad token is 0
 attention_masks = [[int(i>0) for i in ii] for ii in input_ids]
 attention_masks[0];
 segment_ids = [[0] * len(input_id) for input_id in input_ids]
 segment_ids[0];
 input_ids = torch.tensor(input_ids)
 attention_masks = torch.tensor(attention_masks)
 segment_ids = torch.tensor(segment_ids)
 # Set save model to Evalue loop
 model.eval();
 # Get model predict result
 with torch.no_grad():
 outputs = model(input_ids, token_type_ids=None,
 attention_mask=None,)
 # For eval mode, the first result of outputs is logits
 logits = outputs[0]
 
 # Make logits into numpy type predict result
 # The predict result contain each token's all tags predict result
 predict_results = logits.detach().cpu().numpy()

 predict_results.shape

 from scipy.special import softmax

 result_arrays_soft = softmax(predict_results[0])

 result_array = result_arrays_soft

 # Get each token final predict tag index result
 result_list = np.argmax(result_array,axis=-1)

 
 x = list()
 y = list()
 new_tokens, new_labels = [], []
 for i, mark in enumerate(attention_masks[0]):
 if mark>0:
 print("Token:%s"%(temp_token[i]))
 x.append(temp_token[i])
 # print("Tag:%s"%(result_list[i]))
 print("Predict_Tag:%s"%(tag2name[result_list[i]]))
 y.append(result_list[i])
 # print("Posibility:%f"%(result_array[i][result_list[i]]))
 
 for token, label_idx in zip(x, y):
 if token.startswith("##"):
 new_tokens[-1] = new_tokens[-1] + token[2:]
 else:
 new_labels.append(tag2name[label_idx])
 new_tokens.append(token)
 
 # for token, label in zip(new_tokens, new_labels):
 # print("{} ---------------> {}".format(token, label))
 
 
 tag_names = []
 for i in new_labels[1:-1]:
 tag_names.append(
 tag_2_nees[i]
 )
 
 return new_tokens[1:-1],tag_names

In [37]:
x,y = Get_POS("हाल नेपालका विभिन्न राजनैतिक दलहरूबीच एमसीसी कार्यक्रमबारे मतैक्यता हुन नसकेका कारण आन्दोलन पनि चर्किरहेको छ।")

Token:[CLS]
Predict_Tag:[CLS]
Token:हाल
Predict_Tag:RBO
Token:नेपालका
Predict_Tag:JJ
Token:विभिन्न
Predict_Tag:JJ
Token:राजनैतिक
Predict_Tag:JJ
Token:दलहरूबीच
Predict_Tag:JJ
Token:एमसीसी
Predict_Tag:JJ
Token:कार्यक्रमबारे
Predict_Tag:NN
Token:मतैक्य
Predict_Tag:NN
Token:##ता
Predict_Tag:X
Token:हुन
Predict_Tag:VBI
Token:नसकेका
Predict_Tag:VBKO
Token:कारण
Predict_Tag:NN
Token:आन्दोलन
Predict_Tag:NN
Token:पनि
Predict_Tag:RP
Token:चर्क
Predict_Tag:VBO
Token:##िरहेको
Predict_Tag:X
Token:छ
Predict_Tag:VBX
Token:।
Predict_Tag:YF
Token:[SEP]
Predict_Tag:[SEP]


In [38]:
x,y

(['हाल',
 'नेपालका',
 'विभिन्न',
 'राजनैतिक',
 'दलहरूबीच',
 'एमसीसी',
 'कार्यक्रमबारे',
 'मतैक्यता',
 'हुन',
 'नसकेका',
 'कारण',
 'आन्दोलन',
 'पनि',
 'चर्किरहेको',
 'छ',
 '।'],
 ['Adverb(Other Adverb)',
 'Normal/Unmarked Adjective',
 'Normal/Unmarked Adjective',
 'Normal/Unmarked Adjective',
 'Normal/Unmarked Adjective',
 'Normal/Unmarked Adjective',
 'Noun',
 'Noun',
 'Verb Infinitive',
 'Verb aspectual participle',
 'Noun',
 'Noun',
 'Particle',
 'Other participle verb',
 'Auxiliary Verb',
 'Sentence-final Punctuation'])