{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "import numpy as np\n",
    "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
    "max_len  = 45"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "tag2idx = {'X': 0,\n",
    " 'YM': 1,\n",
    " '[CLS]': 2,\n",
    " 'DUM': 3,\n",
    " 'VBF': 4,\n",
    " 'RP': 5,\n",
    " 'VBKO': 6,\n",
    " 'CS': 7,\n",
    " 'VBX': 8,\n",
    " 'VBNE': 9,\n",
    " 'CC': 10,\n",
    " 'Unknown': 11,\n",
    " 'PKO': 12,\n",
    " 'JJM': 13,\n",
    " 'PLE': 14,\n",
    " 'VBO': 15,\n",
    " 'HRU': 16,\n",
    " 'YF': 17,\n",
    " 'NN': 18,\n",
    " 'YQ': 19,\n",
    " 'VBI': 20,\n",
    " '[SEP]': 21,\n",
    " 'JJ': 22,\n",
    " 'POP': 23,\n",
    " 'PLAI': 24,\n",
    " 'RBO': 25,\n",
    " 'PP': 26,\n",
    " 'CD': 27,\n",
    " 'NNP': 28}\n",
    "\n",
    "# Mapping index to name\n",
    "tag2name={tag2idx[key] : key for key in tag2idx.keys()}\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "tag_2_nees =  {'NN': 'Noun',\n",
    "'JJ': 'Normal/Unmarked Adjective', \n",
    "'NNP': 'Noun Plural',\n",
    "'POP': 'Other Postpositions',\n",
    "'PKO': 'Ko-Postpositions', \n",
    "'YF': 'Sentence-final Punctuation',\n",
    "'CD': 'Cardinal Digits',\n",
    "'PLE':'Postpositions(Le- postpositions)',\n",
    "'VBF': 'Finite Verb', \n",
    "'HRU': 'Plural Marker',\n",
    "'YM': 'Sentence-medial punctuation',\n",
    "'VBX': 'Auxiliary Verb',\n",
    "'VBKO': 'Verb aspectual participle',\n",
    "'CC': 'Coordinating conjunction',\n",
    " 'DUM':'Pronoun unmarked demonstrative',\n",
    " 'VBNE': 'Verb(Prospective participle)',\n",
    " 'VBO':'Other participle verb',\n",
    "'PLAI': 'Postpositions(Lai-Postpositions)',\n",
    " 'RBO': 'Adverb(Other Adverb)',\n",
    " 'VBI': 'Verb Infinitive',\n",
    " 'YQ': 'Quotation Marks',\n",
    " 'PP':'Possessive pronoun',\n",
    " 'JJM': 'Marked adjective',\n",
    " 'CS': 'Subordinating conjunction appearing before/after the clause it subordinates',\n",
    " 'RP': 'Particle'}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ! pip install transformers\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import BertForMaskedLM\n",
    "from transformers import BertTokenizer\n",
    "model = BertForMaskedLM.from_pretrained('./models/bert_out_model/en09',\n",
    "                                            num_labels=len(tag2idx),\n",
    "                                            output_attentions = False,\n",
    "                                            output_hidden_states = False\n",
    "                                       )\n",
    "vocab_file_dir = './models/bert_out_model/en09' \n",
    "tokenizer = BertTokenizer.from_pretrained(vocab_file_dir,\n",
    "                                        strip_accents=False,\n",
    "                                         clean_text=False )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "def Get_POS(test_query):\n",
    "    tokenized_texts = []\n",
    "    temp_token = []\n",
    "    # Add [CLS] at the front \n",
    "    temp_token.append('[CLS]')\n",
    "    token_list = tokenizer.tokenize(test_query)\n",
    "    for m,token in enumerate(token_list):\n",
    "        temp_token.append(token)\n",
    "    # Trim the token to fit the length requirement\n",
    "    if len(temp_token) > max_len-1:\n",
    "        temp_token= temp_token[:max_len-1]\n",
    "    # Add [SEP] at the end\n",
    "    temp_token.append('[SEP]')\n",
    "    tokenized_texts.append(temp_token)\n",
    "    # Make text token into id\n",
    "    input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],\n",
    "                            maxlen=max_len, dtype=\"long\", truncating=\"post\", padding=\"post\")\n",
    "    # print(input_ids[0])\n",
    "    \n",
    "    # For fine tune of predict, with token mask is 1,pad token is 0\n",
    "    attention_masks = [[int(i>0) for i in ii] for ii in input_ids]\n",
    "    attention_masks[0];\n",
    "    segment_ids = [[0] * len(input_id) for input_id in input_ids]\n",
    "    segment_ids[0];\n",
    "    input_ids = torch.tensor(input_ids)\n",
    "    attention_masks = torch.tensor(attention_masks)\n",
    "    segment_ids = torch.tensor(segment_ids)\n",
    "    # Set save model to Evalue loop\n",
    "    model.eval();\n",
    "    # Get model predict result\n",
    "    with torch.no_grad():\n",
    "            outputs = model(input_ids, token_type_ids=None,\n",
    "            attention_mask=None,)\n",
    "            # For eval mode, the first result of outputs is logits\n",
    "            logits = outputs[0]\n",
    "            \n",
    "    # Make logits into numpy type predict result\n",
    "    # The predict result contain each token's all tags predict result\n",
    "    predict_results = logits.detach().cpu().numpy()\n",
    "\n",
    "    predict_results.shape\n",
    "\n",
    "    from scipy.special import softmax\n",
    "\n",
    "    result_arrays_soft = softmax(predict_results[0])\n",
    "\n",
    "    result_array = result_arrays_soft\n",
    "\n",
    "    # Get each token final predict tag index result\n",
    "    result_list = np.argmax(result_array,axis=-1)\n",
    "\n",
    "        \n",
    "    x = list()\n",
    "    y = list()\n",
    "    new_tokens, new_labels = [], []\n",
    "    for i, mark in enumerate(attention_masks[0]):\n",
    "        if mark>0:\n",
    "            print(\"Token:%s\"%(temp_token[i]))\n",
    "            x.append(temp_token[i])\n",
    "    #         print(\"Tag:%s\"%(result_list[i]))\n",
    "            print(\"Predict_Tag:%s\"%(tag2name[result_list[i]]))\n",
    "            y.append(result_list[i])\n",
    "            # print(\"Posibility:%f\"%(result_array[i][result_list[i]]))\n",
    "    \n",
    "    for token, label_idx in zip(x, y):\n",
    "        if token.startswith(\"##\"):\n",
    "            new_tokens[-1] = new_tokens[-1] + token[2:]\n",
    "        else:\n",
    "            new_labels.append(tag2name[label_idx])\n",
    "            new_tokens.append(token)\n",
    "        \n",
    "    # for token, label in zip(new_tokens, new_labels):\n",
    "    #     print(\"{} ---------------> {}\".format(token, label))\n",
    "    \n",
    "    \n",
    "    tag_names = []\n",
    "    for i in new_labels[1:-1]:\n",
    "        tag_names.append(\n",
    "            tag_2_nees[i]\n",
    "        )\n",
    "       \n",
    "    return new_tokens[1:-1],tag_names"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Token:[CLS]\n",
      "Predict_Tag:[CLS]\n",
      "Token:हाल\n",
      "Predict_Tag:RBO\n",
      "Token:नेपालका\n",
      "Predict_Tag:JJ\n",
      "Token:विभिन्न\n",
      "Predict_Tag:JJ\n",
      "Token:राजनैतिक\n",
      "Predict_Tag:JJ\n",
      "Token:दलहरूबीच\n",
      "Predict_Tag:JJ\n",
      "Token:एमसीसी\n",
      "Predict_Tag:JJ\n",
      "Token:कार्यक्रमबारे\n",
      "Predict_Tag:NN\n",
      "Token:मतैक्य\n",
      "Predict_Tag:NN\n",
      "Token:##ता\n",
      "Predict_Tag:X\n",
      "Token:हुन\n",
      "Predict_Tag:VBI\n",
      "Token:नसकेका\n",
      "Predict_Tag:VBKO\n",
      "Token:कारण\n",
      "Predict_Tag:NN\n",
      "Token:आन्दोलन\n",
      "Predict_Tag:NN\n",
      "Token:पनि\n",
      "Predict_Tag:RP\n",
      "Token:चर्क\n",
      "Predict_Tag:VBO\n",
      "Token:##िरहेको\n",
      "Predict_Tag:X\n",
      "Token:छ\n",
      "Predict_Tag:VBX\n",
      "Token:।\n",
      "Predict_Tag:YF\n",
      "Token:[SEP]\n",
      "Predict_Tag:[SEP]\n"
     ]
    }
   ],
   "source": [
    "x,y = Get_POS(\"हाल नेपालका विभिन्न राजनैतिक दलहरूबीच एमसीसी कार्यक्रमबारे मतैक्यता हुन नसकेका कारण आन्दोलन पनि चर्किरहेको छ।\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(['हाल',\n",
       "  'नेपालका',\n",
       "  'विभिन्न',\n",
       "  'राजनैतिक',\n",
       "  'दलहरूबीच',\n",
       "  'एमसीसी',\n",
       "  'कार्यक्रमबारे',\n",
       "  'मतैक्यता',\n",
       "  'हुन',\n",
       "  'नसकेका',\n",
       "  'कारण',\n",
       "  'आन्दोलन',\n",
       "  'पनि',\n",
       "  'चर्किरहेको',\n",
       "  'छ',\n",
       "  '।'],\n",
       " ['Adverb(Other Adverb)',\n",
       "  'Normal/Unmarked Adjective',\n",
       "  'Normal/Unmarked Adjective',\n",
       "  'Normal/Unmarked Adjective',\n",
       "  'Normal/Unmarked Adjective',\n",
       "  'Normal/Unmarked Adjective',\n",
       "  'Noun',\n",
       "  'Noun',\n",
       "  'Verb Infinitive',\n",
       "  'Verb aspectual participle',\n",
       "  'Noun',\n",
       "  'Noun',\n",
       "  'Particle',\n",
       "  'Other participle verb',\n",
       "  'Auxiliary Verb',\n",
       "  'Sentence-final Punctuation'])"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x,y"
   ]
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "ca894e04cc6fd3e8c60826e0ca22793858ad83aa785622f3d49ff6f88f1ccbf8"
  },
  "kernelspec": {
   "display_name": "Python 3.7.0 64-bit ('pt3.7': conda)",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.5"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}