from sklearn.model_selection import train_test_split

# Hold out 30% of the samples as the test set. The fixed seed makes the
# shuffled split reproducible from one run to the next.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
    shuffle=True,
)
樣品分割:
參考解答:
import pandas as pd
import sys

# Load the Boston housing CSV (the original note records shape (506, 14)).
fpath = r"C:\Python\P107\BostonHousing.csv"
dataset = pd.read_csv(fpath)
headerList = dataset.columns.tolist()
cols = dataset.columns.size  # 14 according to the original note

# Features: every column except the last one. Target: the last column.
x = dataset.drop(headerList[-1], axis=1).values
y = dataset[headerList[-1]].values

# Alternative using positional slicing with iloc:
#     x = dataset.iloc[:, 0:cols-1].values
#     y = dataset.iloc[:, cols-1:].values
# The iloc-sliced y holds the same data but stays 2-D, with a second
# dimension of length 1. Flatten it to 1-D with ravel():
#     y = dataset.iloc[:, cols-1:].values.ravel()
# Slicing is convenient here only because the target happens to be the
# last column; when it is not, the drop() approach above is easier.

from sklearn.model_selection import train_test_split

# 30% of the rows go to the test set; random_state pins the shuffle so
# the same rows are selected on every run.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
    shuffle=True,
)

print("The shape of training data(X axis):", xtrain.shape)
print("The shape of training data(Y axis):", ytrain.shape)
print("The shape of testing data(X axis):", xtest.shape)
print("The shape of testing data(Y axis):", ytest.shape)
506*0.3 = 151.8,無條件進位為 152 (test_size=0.3)
506-152 = 354
改用iloc做切片
.ravel() 將2D降維成1D
實測random_state跟shuffle參數:
若真的需要每次取的都不一樣
random_state = None
shuffle = True
其他組合每次都會
取出一樣的samples
每次都一樣,
會比較容易debug
或判斷模型的優劣
random_state設了一樣的種子
即使shuffle = True,洗牌結果也是固定的
每次都會取出一樣的資料
第四個組合
random_state = None
shuffle = False
沒種子,沒洗牌
為什麼每次都一樣?
因為shuffle = False時不會洗牌,直接依原本順序切分(前段為訓練集,後段為測試集)
from官網:
推薦hahow線上學習python: https://igrape.net/30afN
波士頓地區房價:
boston (整齊版):
boston(排版較亂): http://lib.stat.cmu.edu/datasets/boston
原始檔中每筆資料被拆成兩列,需用以下方式才能把資料拼接回來:
import pandas as pd
import numpy as np

url = r"http://lib.stat.cmu.edu/datasets/boston"

# sep is a regular expression: one or more whitespace characters
# (spaces or tabs). The first 22 lines of the file are skipped --
# presumably the descriptive preamble; confirm against the raw file.
dfBoston = pd.read_csv(url, sep=r"\s+", skiprows=22, header=None)

# The parsed frame alternates between two kinds of rows. Features are
# rebuilt by placing every even row next to the first two fields of the
# following odd row; the target is the third field of each odd row.
x = np.hstack([dfBoston.values[::2, :], dfBoston.values[1::2, :2]])
y = dfBoston.values[1::2, 2]
x:
部分y:
y.shape
推薦hahow線上學習python: https://igrape.net/30afN