## Notebook to test WAR performance on a fully labelled dataset

In [None]:
import numpy as np
import itertools
import time
import matplotlib.pyplot as plt

import torch
import torch.optim as optim

from WAR.Models import NN_phi,NN_h_RELU
from WAR.training_and_query import WAR
from WAR.dataset_handler import myData,import_dataset,get_dataset
from WAR.Experiment_functions import *
from WAR.full_training_process import full_training,check_num_round
from sklearn.cluster import KMeans

from sklearn.decomposition import PCA

# Select the compute device name: "cuda" when torch reports a usable GPU, "cpu" otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device} device")

In [None]:
# Pick the dataset and split it with the desired test-set proportion.
# Dataset names listed as available for now:
#   "boston", "airfoil", "energy1", "energy2", "yacht",
#   "concrete_slump", "concrete_flow", "concrete_compressive",
#   "x_squared", "news_popularity"
X_train, X_test, y_train, y_test = get_dataset(
    proportion=0.2,
    dataset="boston",
)

In [None]:
# 2D PCA visualisation of the training data.
pca = PCA(n_components=2)
transformed = pca.fit_transform(X=X_train)

# explained_variance_ratio_ holds each kept component's share of the total
# variance, so its sum is the fraction of variance preserved by the 2D
# projection. (explained_variance_ holds absolute variances; summing those
# does not give a percentage.)
print(f"{pca.explained_variance_ratio_.sum():.2%} variance explained")

plt.figure(figsize=(8.5, 6))
# To colour the points by cluster, pass c=<cluster labels> to scatter,
# e.g. labels obtained from KMeans(...).fit_predict(X_train).
plt.scatter(x=transformed[:, 0], y=transformed[:, 1])

# WAR

In [None]:
# --- Training budget per round ------------------------------------------------
total_epoch_h = 100    # number of epochs to train h each round
total_epoch_phi = 100  # number of epochs to train phi each round

# --- Labelling budget ---------------------------------------------------------
# Both budgets are 2% of the number of rows in X_train.
num_elem_queried = int(0.02 * X_train.shape[0])           # elements queried each round
nb_initial_labelled_datas = int(0.02 * X_train.shape[0])  # labelled samples at round 0

# How the initial labelled samples are chosen: "random" or "k_mean"
# (these are the exact strings the selection code compares against).
init_method = "k_mean"

# Query strategy assisting the distribution-matching criterion:
# "loss_approximation" or None for now.
second_query_strategy = "loss_approximation"

# --- Optimisation -------------------------------------------------------------
lr_h = 0.001          # learning rate of h
lr_phi = 0.01         # learning rate of phi
weight_decay = 0.001  # L2 regularisation on h

# Batch size used during training; len(X_train) puts the whole training set in one batch.
batch_size_train = len(X_train)

# Requested number of rounds, then adjusted by check_num_round from the pool
# size and the two labelling budgets.
# NOTE(review): presumably this caps the value so the pool is not exhausted;
# confirm in WAR.full_training_process.
num_round = 500
num_round = check_num_round(
    num_round,
    len(y_train),
    nb_initial_labelled_datas,
    num_elem_queried,
)

# Whether the training of phi / h is reset at each round.
reset_phi = False
reset_h = False

# If True (recommended), the heterogeneity and representativity criteria are
# given the same standard deviation, hence the same weight in the query
# process. This gives more control over the querying strategy.
reduced = True

# Weight of the representativity criterion. A relatively low value (< 3) can
# lead WAR to query too many outliers; eta >= 3 is recommended and it can be
# raised further when the data distribution contains many outliers.
eta = 3

# --- Diagnostics --------------------------------------------------------------
show_losses = False             # show the T1 and T2 losses of each round in a graph
show_chosen_each_round = False  # show the samples chosen each round in a 2D PCA view

# Number of input features.
dim_input = X_train.shape[1]

# Start timing the whole run (paired with the `stop` timestamp taken after training).
start = time.time()

n_pool = len(y_train)
n_test = len(y_test)
# Boolean mask over the pool: True marks a sample selected as initially labelled.
idxs_lb = np.zeros(n_pool, dtype=bool)
idxs_tmp = np.arange(n_pool)

if init_method == "random":
    # Label a uniformly random subset of the pool.
    np.random.shuffle(idxs_tmp)
    idxs_lb[idxs_tmp[:nb_initial_labelled_datas]] = True
elif init_method in ("k_mean", "k-means"):
    # Cluster the pool into as many groups as initial labels, then label the
    # sample closest (squared Euclidean distance) to each cluster centre.
    # "k-means" is accepted as well because that is the spelling used in the
    # configuration comment; previously it silently selected nothing.
    kmeans = KMeans(
        n_clusters=nb_initial_labelled_datas, init='k-means++', n_init=10
    ).fit(X_train)
    init_indices = [
        np.argmin(((X_train - centre) ** 2).sum(axis=1))
        for centre in kmeans.cluster_centers_
    ]
    # NOTE: if two centres share the same nearest sample, fewer than
    # nb_initial_labelled_datas samples end up labelled.
    idxs_lb[init_indices] = True
else:
    # Fail loudly instead of starting the run with an empty labelled set.
    raise ValueError(
        f"Unknown init_method {init_method!r}; expected 'random' or 'k_mean'"
    )

# Network h and its Adam optimiser (with L2 weight decay).
h = NN_h_RELU(dim_input)
opti_h = optim.Adam(h.parameters(), lr=lr_h, weight_decay=weight_decay)

# Network phi and its Adam optimiser; maximize=True makes the optimiser
# maximise its objective instead of minimising it.
phi = NN_phi(dim_input)
opti_phi = optim.Adam(phi.parameters(), lr=lr_phi, maximize=True)

# Bundle data, budgets, networks and optimisers into the WAR strategy object.
strategy = WAR(
    X_train, y_train, X_test, y_test,
    idxs_lb,
    total_epoch_h, total_epoch_phi,
    batch_size_train, num_elem_queried,
    phi, h, opti_phi, opti_h,
    second_query_strategy,
)

# Run all rounds; collects the per-round error curves and the T1 / T2 loss histories.
(
    error_each_round,
    error_each_round_per,
    error_each_round_rmse,
    t1_descend_list,
    t2_ascend_list,
) = full_training(
    strategy, num_round, show_losses, show_chosen_each_round,
    reset_phi, reset_h, weight_decay, lr_h, lr_phi, reduced, eta,
)

# Report the elapsed time of the run.
stop = time.time()
time_execution(start, stop)

In [None]:
# Loss of h (T1): concatenate the per-round loss lists into one curve, on a log scale.
plt.plot(
    list(itertools.chain.from_iterable(t1_descend_list)),
    c="green",
)
plt.yscale("log")
plt.grid(True)
plt.title("T1 loss evolution each batch", fontsize=20)

In [None]:
# Loss of phi (T2): concatenate the per-round loss lists into one curve.
plt.plot(
    np.array(list(itertools.chain.from_iterable(t2_ascend_list))),
    c="brown",
)
plt.grid(True)
plt.title("T2 loss evolution each batch", fontsize=20)

In [None]:
# RMSE curve: one value per round, as returned by full_training.
plt.plot(error_each_round_rmse)
plt.grid(True)
plt.title("RMSE of h each rounds", fontsize=20)

In [None]:
# MAE curve: one value per round, as returned by full_training.
plt.plot(error_each_round)
plt.grid(True)
plt.title("mean absolute error of h each rounds", fontsize=20)

In [None]:
# MAPE curve: one value per round, as returned by full_training.
plt.plot(error_each_round_per)
plt.grid(True)
plt.title("mean absolute percentage error of h each rounds", fontsize=20)