{ "cells": [ { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [], "source": [ "import torch\n", "import numpy as np\n", "from tensorflow.keras.preprocessing.sequence import pad_sequences\n", "max_len = 45" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": [ "tag2idx = {'X': 0,\n", " 'YM': 1,\n", " '[CLS]': 2,\n", " 'DUM': 3,\n", " 'VBF': 4,\n", " 'RP': 5,\n", " 'VBKO': 6,\n", " 'CS': 7,\n", " 'VBX': 8,\n", " 'VBNE': 9,\n", " 'CC': 10,\n", " 'Unknown': 11,\n", " 'PKO': 12,\n", " 'JJM': 13,\n", " 'PLE': 14,\n", " 'VBO': 15,\n", " 'HRU': 16,\n", " 'YF': 17,\n", " 'NN': 18,\n", " 'YQ': 19,\n", " 'VBI': 20,\n", " '[SEP]': 21,\n", " 'JJ': 22,\n", " 'POP': 23,\n", " 'PLAI': 24,\n", " 'RBO': 25,\n", " 'PP': 26,\n", " 'CD': 27,\n", " 'NNP': 28}\n", "\n", "# Mapping index to name\n", "tag2name={tag2idx[key] : key for key in tag2idx.keys()}\n" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [], "source": [ "tag_2_nees = {'NN': 'Noun',\n", "'JJ': 'Normal/Unmarked Adjective', \n", "'NNP': 'Noun Plural',\n", "'POP': 'Other Postpositions',\n", "'PKO': 'Ko-Postpositions', \n", "'YF': 'Sentence-final Punctuation',\n", "'CD': 'Cardinal Digits',\n", "'PLE':'Postpositions(Le- postpositions)',\n", "'VBF': 'Finite Verb', \n", "'HRU': 'Plural Marker',\n", "'YM': 'Sentence-medial punctuation',\n", "'VBX': 'Auxiliary Verb',\n", "'VBKO': 'Verb aspectual participle',\n", "'CC': 'Coordinating conjunction',\n", " 'DUM':'Pronoun unmarked demonstrative',\n", " 'VBNE': 'Verb(Prospective participle)',\n", " 'VBO':'Other participle verb',\n", "'PLAI': 'Postpositions(Lai-Postpositions)',\n", " 'RBO': 'Adverb(Other Adverb)',\n", " 'VBI': 'Verb Infinitive',\n", " 'YQ': 'Quotation Marks',\n", " 'PP':'Possessive pronoun',\n", " 'JJM': 'Marked adjective',\n", " 'CS': 'Subordinating conjunction appearing before/after the clause it subordinates',\n", " 'RP': 'Particle'}" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [], "source": [ "# ! 
{ "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [], "source": [ "# !pip install transformers" ] },
{ "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [], "source": [ "from transformers import BertForMaskedLM\n", "from transformers import BertTokenizer\n", "\n", "# Load the fine-tuned model and its tokenizer from the local checkpoint\n", "model = BertForMaskedLM.from_pretrained('./models/bert_out_model/en09',\n", "                                        num_labels=len(tag2idx),\n", "                                        output_attentions=False,\n", "                                        output_hidden_states=False)\n", "vocab_file_dir = './models/bert_out_model/en09'\n", "tokenizer = BertTokenizer.from_pretrained(vocab_file_dir,\n", "                                          strip_accents=False,\n", "                                          clean_text=False)" ] },
{ "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [], "source": [ "def Get_POS(test_query):\n", "    tokenized_texts = []\n", "    # Add [CLS] at the front\n", "    temp_token = ['[CLS]']\n", "    for token in tokenizer.tokenize(test_query):\n", "        temp_token.append(token)\n", "    # Trim the sequence so that [SEP] still fits within max_len\n", "    if len(temp_token) > max_len - 1:\n", "        temp_token = temp_token[:max_len - 1]\n", "    # Add [SEP] at the end\n", "    temp_token.append('[SEP]')\n", "    tokenized_texts.append(temp_token)\n", "\n", "    # Convert tokens to ids and pad every sequence to max_len\n", "    input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],\n", "                              maxlen=max_len, dtype=\"long\", truncating=\"post\", padding=\"post\")\n", "\n", "    # Attention mask: 1 for real tokens, 0 for padding\n", "    attention_masks = [[int(i > 0) for i in ii] for ii in input_ids]\n", "\n", "    input_ids = torch.tensor(input_ids)\n", "    attention_masks = torch.tensor(attention_masks)\n", "\n", "    # Put the model in evaluation mode and run a forward pass;\n", "    # the input is a single sentence, so token_type_ids can stay None\n", "    model.eval()\n", "    with torch.no_grad():\n", "        outputs = model(input_ids, token_type_ids=None,\n", "                        attention_mask=attention_masks)\n", "    # In eval mode the first element of outputs is the logits\n", "    logits = outputs[0]\n", "\n", "    # Each token gets a score for every tag; softmax turns the\n", "    # scores for a token into a probability distribution over tags\n", "    predict_results = logits.detach().cpu().numpy()\n", "    result_array = softmax(predict_results[0], axis=-1)\n", "\n", "    # Final predicted tag index for each token\n", "    result_list = np.argmax(result_array, axis=-1)\n", "\n", "    x, y = [], []\n", "    for i, mark in enumerate(attention_masks[0]):\n", "        if mark > 0:\n", "            print(\"Token:%s\" % (temp_token[i]))\n", "            x.append(temp_token[i])\n", "            print(\"Predict_Tag:%s\" % (tag2name[result_list[i]]))\n", "            y.append(result_list[i])\n", "            # print(\"Probability:%f\" % (result_array[i][result_list[i]]))\n", "\n", "    # Merge WordPiece pieces (\"##...\") back into whole words;\n", "    # each piece inherits the tag of the word it belongs to\n", "    new_tokens, new_labels = [], []\n", "    for token, label_idx in zip(x, y):\n", "        if token.startswith(\"##\"):\n", "            new_tokens[-1] = new_tokens[-1] + token[2:]\n", "        else:\n", "            new_labels.append(tag2name[label_idx])\n", "            new_tokens.append(token)\n", "\n", "    # Map tag codes to readable names, dropping [CLS] and [SEP]\n", "    tag_names = [tag_2_nees[i] for i in new_labels[1:-1]]\n", "\n", "    return new_tokens[1:-1], tag_names" ] },
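{ "cell_type": "markdown", "metadata": {}, "source": [ "A small convenience wrapper, sketched here for illustration (the name `tag_sentences` is ours, not from the original code): it calls `Get_POS` on each sentence and pairs every recovered word with its tag name, which is handier than two parallel lists when tagging several sentences. The next cell demonstrates `Get_POS` itself on a single sentence." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Hypothetical helper built on Get_POS: returns (word, tag_name) pairs\n", "def tag_sentences(sentences):\n", "    results = []\n", "    for sentence in sentences:\n", "        words, tags = Get_POS(sentence)\n", "        results.append(list(zip(words, tags)))\n", "    return results" ] },
{ "cell_type": "code",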
"execution_count": 37, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Token:[CLS]\n", "Predict_Tag:[CLS]\n", "Token:हाल\n", "Predict_Tag:RBO\n", "Token:नेपालका\n", "Predict_Tag:JJ\n", "Token:विभिन्न\n", "Predict_Tag:JJ\n", "Token:राजनैतिक\n", "Predict_Tag:JJ\n", "Token:दलहरूबीच\n", "Predict_Tag:JJ\n", "Token:एमसीसी\n", "Predict_Tag:JJ\n", "Token:कार्यक्रमबारे\n", "Predict_Tag:NN\n", "Token:मतैक्य\n", "Predict_Tag:NN\n", "Token:##ता\n", "Predict_Tag:X\n", "Token:हुन\n", "Predict_Tag:VBI\n", "Token:नसकेका\n", "Predict_Tag:VBKO\n", "Token:कारण\n", "Predict_Tag:NN\n", "Token:आन्दोलन\n", "Predict_Tag:NN\n", "Token:पनि\n", "Predict_Tag:RP\n", "Token:चर्क\n", "Predict_Tag:VBO\n", "Token:##िरहेको\n", "Predict_Tag:X\n", "Token:छ\n", "Predict_Tag:VBX\n", "Token:।\n", "Predict_Tag:YF\n", "Token:[SEP]\n", "Predict_Tag:[SEP]\n" ] } ], "source": [ "x,y = Get_POS(\"हाल नेपालका विभिन्न राजनैतिक दलहरूबीच एमसीसी कार्यक्रमबारे मतैक्यता हुन नसकेका कारण आन्दोलन पनि चर्किरहेको छ।\")" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(['हाल',\n", " 'नेपालका',\n", " 'विभिन्न',\n", " 'राजनैतिक',\n", " 'दलहरूबीच',\n", " 'एमसीसी',\n", " 'कार्यक्रमबारे',\n", " 'मतैक्यता',\n", " 'हुन',\n", " 'नसकेका',\n", " 'कारण',\n", " 'आन्दोलन',\n", " 'पनि',\n", " 'चर्किरहेको',\n", " 'छ',\n", " '।'],\n", " ['Adverb(Other Adverb)',\n", " 'Normal/Unmarked Adjective',\n", " 'Normal/Unmarked Adjective',\n", " 'Normal/Unmarked Adjective',\n", " 'Normal/Unmarked Adjective',\n", " 'Normal/Unmarked Adjective',\n", " 'Noun',\n", " 'Noun',\n", " 'Verb Infinitive',\n", " 'Verb aspectual participle',\n", " 'Noun',\n", " 'Noun',\n", " 'Particle',\n", " 'Other participle verb',\n", " 'Auxiliary Verb',\n", " 'Sentence-final Punctuation'])" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "x,y" ] } ], "metadata": { "interpreter": { "hash": "ca894e04cc6fd3e8c60826e0ca22793858ad83aa785622f3d49ff6f88f1ccbf8" }, "kernelspec": { "display_name": "Python 3.7.0 64-bit ('pt3.7': conda)", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.5" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }