import torch
from torch.utils.data import Dataset
import random
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn import datasets
def import_dataset(name):  # import a dataset among the list of available ones
    if name == "boston":
        # the Boston housing dataset was removed from scikit-learn, so it is
        # rebuilt from the raw CMU file (each record spans two physical rows)
        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]
        y_boston = torch.Tensor(target).view(len(target), 1).float()
        X_boston = torch.Tensor(data).float()
        return X_boston, y_boston
if name=="airfoil":
columns_names=["Frequency","Angle of attack","Chord length","Free-stream velocity","Suction side displacement thickness","sound pressure level"]
airfoil=pd.read_csv('datasets/airfoil_self_noise.dat',sep='\t',names=columns_names)
y_airfoil=airfoil["sound pressure level"]
X_airfoil=airfoil.drop("sound pressure level",axis=1)
y_airfoil=torch.Tensor(y_airfoil).view(len(y_airfoil),1).float()
X_airfoil=torch.Tensor(X_airfoil.values).float()
return X_airfoil,y_airfoil
if name=="energy1":
energy=pd.read_csv('datasets/energy efficiency.csv')
y_energy=energy["Y1"]
X_energy=energy.drop(["Y2","Y1"],axis=1)
y_energy=torch.Tensor(y_energy).view(len(y_energy),1).float()
X_energy=torch.Tensor(X_energy.values).float()
return X_energy,y_energy
if name=="energy2":# other target function
energy=pd.read_csv('datasets/energy efficiency.csv')
y_energy=energy["Y2"]
X_energy=energy.drop(["Y2","Y1"],axis=1)
y_energy=torch.Tensor(y_energy).view(len(y_energy),1).float()
X_energy=torch.Tensor(X_energy.values).float()
return X_energy,y_energy
if name=="yacht":
yacht=pd.read_csv('datasets/yacht_hydrodynamics.data',sep=' ',header=None)
y_yacht=yacht[6]
X_yacht=yacht.drop([6],axis=1)
y_yacht=torch.Tensor(y_yacht).view(len(y_yacht),1).float()
X_yacht=torch.Tensor(X_yacht.values).float()
return X_yacht,y_yacht
if name=="concrete_slump":
concrete=pd.read_csv('datasets/slump_test.data',sep=',')
y_concrete=concrete["SLUMP(cm)"]
X_concrete=concrete.drop(["No","SLUMP(cm)","FLOW(cm)","Compressive Strength (28-day)(Mpa)"],axis=1)
y_concrete=torch.Tensor(y_concrete).view(len(y_concrete),1).float()
X_concrete=torch.Tensor(X_concrete.values).float()
return X_concrete,y_concrete
if name=="concrete_flow":#other target function
concrete=pd.read_csv('datasets/slump_test.data',sep=',')
y_concrete=concrete["FLOW(cm)"]
X_concrete=concrete.drop(["No","FLOW(cm)","SLUMP(cm)","Compressive Strength (28-day)(Mpa)"],axis=1)
y_concrete=torch.Tensor(y_concrete).view(len(y_concrete),1).float()
X_concrete=torch.Tensor(X_concrete.values).float()
return X_concrete,y_concrete
if name=="concrete_compressive":#other target function
concrete=pd.read_csv('datasets/slump_test.data',sep=',')
y_concrete=concrete["Compressive Strength (28-day)(Mpa)"]
X_concrete=concrete.drop(["No","FLOW(cm)","SLUMP(cm)","Compressive Strength (28-day)(Mpa)"],axis=1)
y_concrete=torch.Tensor(y_concrete).view(len(y_concrete),1).float()
X_concrete=torch.Tensor(X_concrete.values).float()
return X_concrete,y_concrete
if name=="x_squared":
data_generated=100
x_b=torch.tensor([random.random() for i in range(data_generated)])
x_carré_b=x_b.view(x_b.size()[0],1)
y_carré_b=(x_b**2 + torch.tensor([np.random.normal(loc=0,scale=0.05) for i in range(data_generated)])).view(x_b.size()[0],1)
return x_carré_b,y_carré_b
if name=="news_popularity":
news=pd.read_csv('datasets/OnlineNewsPopularity/OnlineNewsPopularity.csv')
y_news=news[" shares"]
X_news=news.drop([" shares","url"," timedelta"],axis=1)
y_news=torch.Tensor(y_news).view(len(y_news),1).float()
X_news=torch.Tensor(X_news.values).float()
return X_news,y_news
def get_dataset(proportion=0.2,dataset="boston"):# scale and process the data
scaler = MinMaxScaler()
X,y=import_dataset(dataset)
X=torch.Tensor(scaler.fit_transform(X))
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=proportion)
print(f"Shape of the training set: {X_train.shape}")
return X_train,X_test,y_train,y_test
class myData(Dataset):  # minimal Dataset wrapper around a pair of tensors
    def __init__(self, x, y):
        self.x = x
        self.y = y
        self.shape = x.size(0)

    def __getitem__(self, index):
        return self.x[index], self.y[index]

    def __len__(self):
        return self.shape