import torch
from torch.utils.data import Dataset
import random
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler


def import_dataset(name):  # load one of the available datasets by name

    if name=="boston":
        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]
        y_boston=target
        X_boston=data
        y_boston=torch.Tensor(y_boston).view(len(y_boston),1).float()
        X_boston=torch.Tensor(X_boston).float()
        return X_boston,y_boston
    
    if name=="airfoil":
        columns_names=["Frequency","Angle of attack","Chord length","Free-stream velocity","Suction side displacement thickness","sound pressure level"]
        airfoil=pd.read_csv('datasets/airfoil_self_noise.dat',sep='\t',names=columns_names)
        y_airfoil=airfoil["sound pressure level"]
        X_airfoil=airfoil.drop("sound pressure level",axis=1)
        y_airfoil=torch.Tensor(y_airfoil).view(len(y_airfoil),1).float()
        X_airfoil=torch.Tensor(X_airfoil.values).float()
        return X_airfoil,y_airfoil
    
    if name=="energy1":
        energy=pd.read_csv('datasets/energy efficiency.csv')
        y_energy=energy["Y1"]
        X_energy=energy.drop(["Y2","Y1"],axis=1)
        y_energy=torch.Tensor(y_energy).view(len(y_energy),1).float()
        X_energy=torch.Tensor(X_energy.values).float()
        return X_energy,y_energy
    
    if name=="energy2":# other target function
        energy=pd.read_csv('datasets/energy efficiency.csv')
        y_energy=energy["Y2"]
        X_energy=energy.drop(["Y2","Y1"],axis=1)
        y_energy=torch.Tensor(y_energy).view(len(y_energy),1).float()
        X_energy=torch.Tensor(X_energy.values).float()
        return X_energy,y_energy
    
    if name=="yacht":        
        yacht=pd.read_csv('datasets/yacht_hydrodynamics.data',sep=' ',header=None)
        y_yacht=yacht[6]
        X_yacht=yacht.drop([6],axis=1)
        y_yacht=torch.Tensor(y_yacht).view(len(y_yacht),1).float()
        X_yacht=torch.Tensor(X_yacht.values).float()
        return X_yacht,y_yacht  
    
    if name=="concrete_slump":        
        concrete=pd.read_csv('datasets/slump_test.data',sep=',')
        y_concrete=concrete["SLUMP(cm)"]
        X_concrete=concrete.drop(["No","SLUMP(cm)","FLOW(cm)","Compressive Strength (28-day)(Mpa)"],axis=1)
        y_concrete=torch.Tensor(y_concrete).view(len(y_concrete),1).float()
        X_concrete=torch.Tensor(X_concrete.values).float()
        return X_concrete,y_concrete 
    
    if name=="concrete_flow":#other target function
        concrete=pd.read_csv('datasets/slump_test.data',sep=',')
        y_concrete=concrete["FLOW(cm)"]
        X_concrete=concrete.drop(["No","FLOW(cm)","SLUMP(cm)","Compressive Strength (28-day)(Mpa)"],axis=1)
        y_concrete=torch.Tensor(y_concrete).view(len(y_concrete),1).float()
        X_concrete=torch.Tensor(X_concrete.values).float()
        return X_concrete,y_concrete 
    
    if name=="concrete_compressive":#other target function
        concrete=pd.read_csv('datasets/slump_test.data',sep=',')
        y_concrete=concrete["Compressive Strength (28-day)(Mpa)"]
        X_concrete=concrete.drop(["No","FLOW(cm)","SLUMP(cm)","Compressive Strength (28-day)(Mpa)"],axis=1)
        y_concrete=torch.Tensor(y_concrete).view(len(y_concrete),1).float()
        X_concrete=torch.Tensor(X_concrete.values).float()
        return X_concrete,y_concrete 
    if name=="x_squared":
        
        data_generated=100
        x_b=torch.tensor([random.random() for i in range(data_generated)])
        x_carré_b=x_b.view(x_b.size()[0],1)
        y_carré_b=(x_b**2 + torch.tensor([np.random.normal(loc=0,scale=0.05) for i in range(data_generated)])).view(x_b.size()[0],1)
        return x_carré_b,y_carré_b
    
    if name=="news_popularity":
        news=pd.read_csv('datasets/OnlineNewsPopularity/OnlineNewsPopularity.csv')
        y_news=news[" shares"]
        X_news=news.drop([" shares","url"," timedelta"],axis=1)
        y_news=torch.Tensor(y_news).view(len(y_news),1).float()
        X_news=torch.Tensor(X_news.values).float()
        return X_news,y_news

    raise ValueError(f"Unknown dataset name: {name}")

def get_dataset(proportion=0.2,dataset="boston"):  # min-max scale the features and split into train/test sets

    scaler = MinMaxScaler()
    X,y=import_dataset(dataset)
    # note: the scaler is fit on the full dataset, i.e. before the train/test split
    X=torch.Tensor(scaler.fit_transform(X))
    X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=proportion)
    print(f"Shape of the training set: {X_train.shape}")
    return X_train,X_test,y_train,y_test



class myData(Dataset):  # simple Dataset wrapper around pre-built feature/target tensors

    def __init__(self,x,y):
        self.x=x
        self.y=y
        self.shape=x.size(0)  # number of samples
        
    def __getitem__(self,index):
        return self.x[index],self.y[index]
    
    def __len__(self):
        return self.shape
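

if __name__ == "__main__":
    # Minimal usage sketch (illustrative only): load the synthetic "x_squared" dataset
    # (chosen here because it needs no local data files), wrap the training split in
    # myData, and iterate over it with a DataLoader. The batch size of 32 is an
    # arbitrary example value, not something fixed by this module.
    from torch.utils.data import DataLoader

    X_train, X_test, y_train, y_test = get_dataset(proportion=0.2, dataset="x_squared")
    train_data = myData(X_train, y_train)
    train_loader = DataLoader(train_data, batch_size=32, shuffle=True)

    for batch_x, batch_y in train_loader:
        print(batch_x.shape, batch_y.shape)  # e.g. torch.Size([32, 1]) torch.Size([32, 1])
        break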