{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Notebook to test WAR performances on a fully labelled dataset" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": false }, "outputs": [], "source": [ "import numpy as np\n", "import itertools\n", "import time\n", "import matplotlib.pyplot as plt\n", "\n", "import torch\n", "import torch.optim as optim\n", "\n", "from WAR.Models import NN_phi,NN_h_RELU\n", "from WAR.training_and_query import WAR\n", "from WAR.dataset_handler import myData,import_dataset,get_dataset\n", "from WAR.Experiment_functions import *\n", "from WAR.full_training_process import full_training,check_num_round\n", "from sklearn.cluster import KMeans\n", "\n", "from sklearn.decomposition import PCA\n", "\n", "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", "print(f\"Using {device} device\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": false }, "outputs": [], "source": [ "#choosing dataset and splitting it with the desired testset proportion\n", "# for now dataset=\n", "#\"boston\",\"airfoil\",\"energy1\",\"energy2\",\"yacht\"\n", "#,\"concrete_slump\",\"concrete_flow\",\"concrete_compressive\",x_squared\",\"news_popularity\"\n", "\n", "X_train,X_test,y_train,y_test=get_dataset(proportion=0.2,dataset=\"boston\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#2D PCA visualization of the data\n", "#kmeans = KMeans(n_clusters=nb_initial_labelled_datas, init='k-means++', n_init=10).fit_predict(X_train)\n", "pca = PCA(n_components=2)\n", "transformed = pca.fit_transform(X=X_train)\n", "print(f\"{round(sum(pca.explained_variance_),4)*100}% variance explained\")\n", "plt.figure(figsize=(8.5, 6))\n", "plt.scatter(x=transformed[:, 0], y=transformed[:, 1]#,c=kmeans\n", " )" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# WAR" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "total_epoch_h=100 # number of epochs to train h each round\n", "total_epoch_phi=100 # number of epochs to train phi each round \n", "num_elem_queried= int(0.02*X_train.shape[0]) # number of elem queried each round \n", "nb_initial_labelled_datas = int(0.02*X_train.shape[0]) #nb of labelled datas at round 0\n", "init_method=\"k_mean\" # how the initial data will be chosen. \"random\" or \"k-means\" \n", "second_query_strategy=\"loss_approximation\" # query strategy assisting our distribution-matching criterion. \"loss_approximation\" or None for now\n", "lr_h=0.001 # learning rate h \n", "lr_phi=0.01 # learning rate phi \n", "weight_decay=0.001 # L2 regularization on h\n", "\n", "batch_size_train=len(X_train) # size of the batch during the training process #len(X_train)\n", "num_round=500 # number of rounds\n", "num_round=check_num_round(num_round,len(y_train),nb_initial_labelled_datas,num_elem_queried)\n", "\n", "\n", "reset_phi=False # reset the training of phi each round or not\n", "reset_h=False # reset the training of h each round or not\n", "\n", "reduced=True # if true (recommended),\n", "#the heterogeneity and representativity criteria will have the same standard deviation,\n", "#to give them the same weight in the query process. This give us more control on our querying strategy\n", "\n", "eta=3 # weight of the representativity criterion. if relatively low (<3) can lead WAR to query too many outliers\n", "# cnst_t3phi>3 recommended, can be put higher if there are a lot of outliers in the data distribution \n", "\n", "show_losses=False # show T1 and T2 losses each rounds in a graph\n", "show_chosen_each_round=False # show which data have been chosen each round in a 2D PCA representation of the data\n", "\n", "dim_input=X_train.shape[1]\n", "\n", "start=time.time()\n", "\n", "n_pool = len(y_train)\n", "n_test = len(y_test)\n", "idxs_lb = np.zeros(n_pool, dtype=bool)\n", "idxs_tmp = np.arange(n_pool)\n", "\n", "\n", "if init_method==\"random\":\n", " # Generate the initial labeled pool\n", " np.random.shuffle(idxs_tmp)\n", " idxs_lb[idxs_tmp[:nb_initial_labelled_datas]] = True\n", " \n", "elif init_method==\"k_mean\":\n", " init_indices=[]\n", " kmeans = KMeans(n_clusters=nb_initial_labelled_datas, init='k-means++', n_init=10).fit(X_train)\n", " for i in range(nb_initial_labelled_datas):\n", " xsc = kmeans.cluster_centers_[i]\n", " ind = np.argmin(((X_train - xsc) ** 2).sum(axis=1))\n", " init_indices.append(ind)\n", " idxs_lb[init_indices] = True\n", "\n", "h=NN_h_RELU(dim_input)\n", "opti_h = optim.Adam(h.parameters(), lr=lr_h,weight_decay=weight_decay)\n", "\n", "phi=NN_phi(dim_input)\n", "opti_phi = optim.Adam(phi.parameters(), lr=lr_phi,maximize=True)\n", "\n", "strategy = WAR(X_train,y_train,X_test,y_test,idxs_lb,total_epoch_h,total_epoch_phi,batch_size_train,num_elem_queried,phi\n", " ,h,opti_phi,opti_h,second_query_strategy)\n", " \n", "error_each_round,error_each_round_per,error_each_round_rmse,t1_descend_list,t2_ascend_list=full_training(\n", " strategy,num_round,show_losses,show_chosen_each_round\n", " ,reset_phi,reset_h,weight_decay,lr_h,lr_phi,reduced,eta)\n", "\n", "\n", "stop=time.time()\n", "\n", "time_execution(start,stop)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "#plot the loss of h\n", "\n", "plt.plot(list(itertools.chain(*t1_descend_list)),c=\"green\")\n", "plt.grid(True)\n", "plt.yscale(\"log\")\n", "plt.title(\"T1 loss evolution each batch\",fontsize=20)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "#plot the loss of phi\n", "\n", "plt.plot(np.array(list(itertools.chain(*t2_ascend_list))),c=\"brown\")\n", "plt.grid(True)\n", "plt.title(\"T2 loss evolution each batch\",fontsize=20)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": false }, "outputs": [], "source": [ "#plot RMSE\n", "\n", "plt.plot(error_each_round_rmse)\n", "plt.grid(True)\n", "plt.title(\"RMSE of h each rounds\",fontsize=20)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "#plot MAE\n", "\n", "plt.plot(error_each_round)\n", "plt.grid(True)\n", "plt.title(\"mean absolute error of h each rounds\",fontsize=20)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "#plot MAPE\n", "\n", "plt.plot(error_each_round_per)\n", "plt.grid(True)\n", "plt.title(\"mean absolute percentage error of h each rounds\",fontsize=20)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.16" } }, "nbformat": 4, "nbformat_minor": 4 }