공부/Python Machine learning
17. 머신러닝과 선형대수
state
2022. 9. 22. 01:55
import csv
# Read every row of the CSV file into `raw`; each row is a list of strings.
raw = []
# NOTE(review): "address" is a placeholder for the real CSV path (the original
# literal was unterminated) - replace it before running.
# newline="" follows the csv module's guidance for files handed to csv.reader.
with open("address", "r", encoding="UTF-8", newline="") as f:
    rdr = csv.reader(f)
    raw.extend(rdr)
# Convert the string cells to floats.
# Row 0 of `raw` is skipped (presumably the header row - confirm against the CSV).
n_row = len(raw)                    # number of CSV rows, including row 0
n_col = len(raw[0]) if raw else 0   # number of columns, taken from row 0
data = [[float(raw[i][j]) for j in range(n_col)] for i in range(1, n_row)]
# Extract the 7th column (index 6) of every row of `data`.
temp = [row[6] for row in data]
전치 행렬 구하기: mla.transpose(data)
# Feature matrix: every column of each data row except the last one.
X = [row[:-1] for row in data]
# Target vector: column index 10 of every data row.
# NOTE(review): the index is hard-coded, so it is the last column only when
# the CSV has exactly 11 columns - confirm against the data file.
y = [row[10] for row in data]
# Mean of each feature (column) of X.
n = len(X)                  # number of rows of X
p = len(X[0]) if X else 0   # number of columns of X
# With an empty X, p is 0 and the 1/n division is never evaluated.
mean_col = [(1 / n) * sum(X[i][j] for i in range(n)) for j in range(p)]
공분산 행렬
X에서 평균 행렬 빼기: $X - \bar{X}$
# Mean matrix: one copy of the column-mean vector per row of X.
mean_mat = [list(mean_col) for _ in X]

# Subtract the mean matrix from X, then multiply the transposed result by the
# result itself.
# NOTE(review): `mla` is not imported in the visible part of this file, and
# its subtract/transpose/matmul are presumed to be the usual element-wise
# subtraction, transpose and matrix product - confirm against that module.
X_minus_mean = mla.subtract(X, mean_mat)
X_minus_mean_t = mla.transpose(X_minus_mean)
res = mla.matmul(X_minus_mean_t, X_minus_mean)
공분산 행렬 구하기
# Covariance matrix: scale the leading p x p entries of `res` by 1/n.
# n and p are expected to be the row and column counts of X.
# Note the normalisation is by n, not n - 1.
cov_X = [[(1 / n) * res[i][j] for j in range(p)] for i in range(p)]

# Singular value decomposition of the covariance matrix.
# NOTE(review): assumes mla.svd returns its factors in (U, S, Vt) order - confirm.
U, S, Vt = mla.svd(cov_X)
sklearn library를 활용한 machine learning algorithm
# Split the data into training and test sets.
from sklearn.model_selection import train_test_split

# Results, in order: training features, test features, training target,
# test target. random_state=0 makes the split reproducible.
X_tn, X_te, y_tn, y_te = train_test_split(X, y, random_state=0)
# Standardise the features.
from sklearn.preprocessing import StandardScaler

std_scale = StandardScaler()
# Fit on the training features only, then apply that same fitted scaler to
# both the training and the test features.
std_scale.fit(X_tn)
X_tn_std = std_scale.transform(X_tn)
X_te_std = std_scale.transform(X_te)
# KNN regression with 5 neighbours.
from sklearn.neighbors import KNeighborsRegressor

clf_knn = KNeighborsRegressor(n_neighbors=5)
clf_knn.fit(X_tn_std, y_tn)
# Predict the target for the standardised test features.
pred_y = clf_knn.predict(X_te_std)
# Model evaluation: mean squared error between the test target and the predictions.
from sklearn.metrics import mean_squared_error

print(mean_squared_error(y_te, pred_y))
--------------------------------------------------------------------------------------------------------------------------------------------------
NUMPY AND PANDAS
import pandas as pd
# Load the CSV into a DataFrame.
# NOTE(review): ".data/~~.csv" looks like a placeholder path - confirm the real location.
df = pd.read_csv(".data/~~.csv")
# Bare expression: its value is shown only in an interactive session (notebook/REPL).
df.head()

# Extract the 6th column (positional index 5); also a display-only expression.
df.iloc[:, 5]
# Transpose the DataFrame.
import numpy as np

df_t = np.transpose(df)
# Bare expression: its value is shown only in an interactive session.
df_t

# `shape` is an attribute holding a tuple, not a method, so it is read without
# calling it; the two lookups are also separate statements on separate lines.
df.shape
df_t.shape
# Split the DataFrame into feature and target data.
# The feature names are kept in a list and used to select those columns as X.
feature = ['X','Y','FFMC','DMC','DC','ISI','temp','RH','wind','rain']
X = df[feature]
# The `area` column is stored as the target y.
y = df['area']
# Covariance matrix of the features.
import numpy as np

# np.cov treats each row as one variable by default, so X is transposed first
# to put the features on the rows.
Xt = np.transpose(X)
cov_X = np.cov(Xt)

# Singular value decomposition of the covariance matrix.
U, S, Vt = np.linalg.svd(cov_X)
# Split the data into training and test sets.
from sklearn.model_selection import train_test_split

# Results, in order: training features, test features, training target,
# test target. random_state=0 makes the split reproducible.
X_tn, X_te, y_tn, y_te = train_test_split(X, y, random_state=0)
# Standardise the features.
from sklearn.preprocessing import StandardScaler

std_scale = StandardScaler()
# Fit on the training features only, then apply that same fitted scaler to
# both the training and the test features.
std_scale.fit(X_tn)
X_tn_std = std_scale.transform(X_tn)
X_te_std = std_scale.transform(X_te)
# KNN regression with 5 neighbours.
from sklearn.neighbors import KNeighborsRegressor

clf_knn = KNeighborsRegressor(n_neighbors=5)
clf_knn.fit(X_tn_std, y_tn)
# Predict the target for the standardised test features.
pred_y = clf_knn.predict(X_te_std)
# Model evaluation: mean squared error between the test target and the predictions.
from sklearn.metrics import mean_squared_error

print(mean_squared_error(y_te, pred_y))