{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Notebook to test WAR performances on a fully labelled dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import itertools\n",
    "import time\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "import torch\n",
    "import torch.optim as optim\n",
    "\n",
    "from WAR.Models import NN_phi,NN_h_RELU\n",
    "from WAR.training_and_query import WAR\n",
    "from WAR.dataset_handler import myData,import_dataset,get_dataset\n",
    "from WAR.Experiment_functions import *\n",
    "from WAR.full_training_process import full_training,check_num_round\n",
    "from sklearn.cluster import KMeans\n",
    "\n",
    "from sklearn.decomposition import PCA\n",
    "\n",
    "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
    "print(f\"Using {device} device\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "#choosing dataset and splitting it with the desired testset proportion\n",
    "# for now dataset=\n",
    "#\"boston\",\"airfoil\",\"energy1\",\"energy2\",\"yacht\"\n",
    "#,\"concrete_slump\",\"concrete_flow\",\"concrete_compressive\",x_squared\",\"news_popularity\"\n",
    "\n",
    "X_train,X_test,y_train,y_test=get_dataset(proportion=0.2,dataset=\"boston\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#2D PCA visualization of the data\n",
    "#kmeans = KMeans(n_clusters=nb_initial_labelled_datas, init='k-means++', n_init=10).fit_predict(X_train)\n",
    "pca = PCA(n_components=2)\n",
    "transformed = pca.fit_transform(X=X_train)\n",
    "print(f\"{round(sum(pca.explained_variance_),4)*100}% variance explained\")\n",
    "plt.figure(figsize=(8.5, 6))\n",
    "plt.scatter(x=transformed[:, 0], y=transformed[:, 1]#,c=kmeans\n",
    "           )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# WAR"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "total_epoch_h=100 # number of epochs to train h each round\n",
    "total_epoch_phi=100 # number of epochs to train phi each round \n",
    "num_elem_queried= int(0.02*X_train.shape[0]) # number of elem queried each round  \n",
    "nb_initial_labelled_datas = int(0.02*X_train.shape[0]) #nb of labelled datas at round 0\n",
    "init_method=\"k_mean\" # how the initial data will be chosen. \"random\" or \"k-means\" \n",
    "second_query_strategy=\"loss_approximation\" # query strategy assisting our distribution-matching criterion. \"loss_approximation\" or None for now\n",
    "lr_h=0.001  # learning rate h \n",
    "lr_phi=0.01  # learning rate phi  \n",
    "weight_decay=0.001 # L2 regularization on h\n",
    "\n",
    "batch_size_train=len(X_train) # size of the batch during the training process  #len(X_train)\n",
    "num_round=500  # number of rounds\n",
    "num_round=check_num_round(num_round,len(y_train),nb_initial_labelled_datas,num_elem_queried)\n",
    "\n",
    "\n",
    "reset_phi=False  # reset the training of phi each round or not\n",
    "reset_h=False  # reset the training of h each round or not\n",
    "\n",
    "reduced=True  # if true (recommended),\n",
    "#the heterogeneity and representativity criteria will have the same standard deviation,\n",
    "#to give them the same weight in the query process. This give us more control on our querying strategy\n",
    "\n",
    "eta=3   # weight of the representativity criterion. if relatively low (<3) can lead WAR to query too many outliers\n",
    "# cnst_t3phi>3 recommended, can be put higher if there are a lot of outliers in the data distribution \n",
    "\n",
    "show_losses=False   # show T1 and T2 losses each rounds in a graph\n",
    "show_chosen_each_round=False   # show which data have been chosen each round in a 2D PCA representation of the data\n",
    "\n",
    "dim_input=X_train.shape[1]\n",
    "\n",
    "start=time.time()\n",
    "\n",
    "n_pool = len(y_train)\n",
    "n_test = len(y_test)\n",
    "idxs_lb = np.zeros(n_pool, dtype=bool)\n",
    "idxs_tmp = np.arange(n_pool)\n",
    "\n",
    "\n",
    "if init_method==\"random\":\n",
    "    # Generate the initial labeled pool\n",
    "    np.random.shuffle(idxs_tmp)\n",
    "    idxs_lb[idxs_tmp[:nb_initial_labelled_datas]] = True\n",
    "    \n",
    "elif init_method==\"k_mean\":\n",
    "    init_indices=[]\n",
    "    kmeans = KMeans(n_clusters=nb_initial_labelled_datas, init='k-means++', n_init=10).fit(X_train)\n",
    "    for i in range(nb_initial_labelled_datas):\n",
    "        xsc = kmeans.cluster_centers_[i]\n",
    "        ind = np.argmin(((X_train - xsc) ** 2).sum(axis=1))\n",
    "        init_indices.append(ind)\n",
    "    idxs_lb[init_indices] = True\n",
    "\n",
    "h=NN_h_RELU(dim_input)\n",
    "opti_h = optim.Adam(h.parameters(), lr=lr_h,weight_decay=weight_decay)\n",
    "\n",
    "phi=NN_phi(dim_input)\n",
    "opti_phi = optim.Adam(phi.parameters(), lr=lr_phi,maximize=True)\n",
    "\n",
    "strategy = WAR(X_train,y_train,X_test,y_test,idxs_lb,total_epoch_h,total_epoch_phi,batch_size_train,num_elem_queried,phi\n",
    "                ,h,opti_phi,opti_h,second_query_strategy)\n",
    "    \n",
    "error_each_round,error_each_round_per,error_each_round_rmse,t1_descend_list,t2_ascend_list=full_training(\n",
    "    strategy,num_round,show_losses,show_chosen_each_round\n",
    "    ,reset_phi,reset_h,weight_decay,lr_h,lr_phi,reduced,eta)\n",
    "\n",
    "\n",
    "stop=time.time()\n",
    "\n",
    "time_execution(start,stop)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "#plot the loss of h\n",
    "\n",
    "plt.plot(list(itertools.chain(*t1_descend_list)),c=\"green\")\n",
    "plt.grid(True)\n",
    "plt.yscale(\"log\")\n",
    "plt.title(\"T1 loss evolution each batch\",fontsize=20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "#plot the loss of phi\n",
    "\n",
    "plt.plot(np.array(list(itertools.chain(*t2_ascend_list))),c=\"brown\")\n",
    "plt.grid(True)\n",
    "plt.title(\"T2 loss evolution each batch\",fontsize=20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "#plot RMSE\n",
    "\n",
    "plt.plot(error_each_round_rmse)\n",
    "plt.grid(True)\n",
    "plt.title(\"RMSE of h each rounds\",fontsize=20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "#plot MAE\n",
    "\n",
    "plt.plot(error_each_round)\n",
    "plt.grid(True)\n",
    "plt.title(\"mean absolute error of h each rounds\",fontsize=20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "#plot MAPE\n",
    "\n",
    "plt.plot(error_each_round_per)\n",
    "plt.grid(True)\n",
    "plt.title(\"mean absolute percentage error of h each rounds\",fontsize=20)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}