[빅데이터분석기사] 파이썬 머신러닝(ML) 기본 틀 맛보기
일단 아래의 머신러닝 전 과정을 한번 훑고, 세부적으로 공부할 것
---분석데이터 검토---
import pandas as pd
# Load the dataset into a DataFrame ('데이터명' is presumably a placeholder
# for the real CSV path -- replace it before running).
data = pd.read_csv('데이터명', encoding='utf-8')
# Preview the first rows (only rendered when run in a notebook/REPL).
data.head()
# `shape` is a tuple attribute, not a method; calling it raises TypeError.
print(data.shape)
---특성(X)과 레이블(y) 나누기---
# Feature matrix: select the input columns as a DataFrame.
feature_columns = ['a', 'b', 'c']
X1 = data[feature_columns]
# Label: the list selector keeps 'z' as a one-column DataFrame (not a Series).
label_columns = ['z']
y = data[label_columns]
---train, test 데이터셋 나누기---
from sklearn.model_selection import train_test_split
# Split features and labels into train/test partitions.
# `stratify=y` keeps the label distribution the same in both partitions;
# `random_state=42` makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X1,
    y,
    stratify=y,
    random_state=42,
)
---정규화(MinMax)---
from sklearn.preprocessing import MinMaxScaler
# Learn the min/max scaling parameters from the training features only
# (`fit` returns the scaler itself, so construction and fitting are chained).
scaler_minmax = MinMaxScaler().fit(X_train)
# Apply the same fitted scaling to both partitions.
X_scaled_minmax_train = scaler_minmax.transform(X_train)
X_scaled_minmax_test = scaler_minmax.transform(X_test)
---①모델 적용(로지스틱)---
from sklearn.linear_model import LogisticRegression
# Build and fit a logistic-regression classifier on the scaled training data
# (`fit` returns the estimator, so it can be chained onto the constructor).
model = LogisticRegression().fit(X_scaled_minmax_train, y_train)
# Training partition: predicted labels, then the score on the same data.
pred_train = model.predict(X_scaled_minmax_train)
model.score(X_scaled_minmax_train, y_train)
# Test partition: predicted labels, then the score on held-out data.
pred_test = model.predict(X_scaled_minmax_test)
model.score(X_scaled_minmax_test, y_test)
---②모델 적용(선형회귀)---
from sklearn.linear_model import LinearRegression
# Fit an ordinary least-squares regressor on the scaled training data.
model = LinearRegression()
model.fit(X_scaled_minmax_train, y_train)
# Training partition: predictions and score.
pred_train = model.predict(X_scaled_minmax_train)
model.score(X_scaled_minmax_train, y_train)
# Test partition: predict and score here as well, so that `pred_test`
# belongs to this model instead of being left over from an earlier one.
pred_test = model.predict(X_scaled_minmax_test)
model.score(X_scaled_minmax_test, y_test)
---③모델 적용(랜덤포레스트(분류))---
from sklearn.ensemble import RandomForestClassifier
# Fit a random-forest classifier on the scaled training data.
model = RandomForestClassifier()
model.fit(X_scaled_minmax_train, y_train)
# Training partition: predicted labels and score.
pred_train = model.predict(X_scaled_minmax_train)
model.score(X_scaled_minmax_train, y_train)
# Test partition: predict and score here as well, so that `pred_test`
# belongs to this model instead of being left over from an earlier one.
pred_test = model.predict(X_scaled_minmax_test)
model.score(X_scaled_minmax_test, y_test)
---④모델 적용(랜덤포레스트(회귀))---
from sklearn.ensemble import RandomForestRegressor
# Fit a random-forest regressor on the scaled training data.
model = RandomForestRegressor()
model.fit(X_scaled_minmax_train, y_train)
# Training partition: predictions and score.
pred_train = model.predict(X_scaled_minmax_train)
model.score(X_scaled_minmax_train, y_train)
# Test partition: predict and score here as well, so that `pred_test`
# belongs to this model instead of being left over from an earlier one.
pred_test = model.predict(X_scaled_minmax_test)
model.score(X_scaled_minmax_test, y_test)
from sklearn.metrics import confusion_matrix
# Confusion matrix of true vs. predicted labels on the training partition.
# NOTE(review): uses whatever `pred_train` was assigned last -- presumably
# from a classifier section; confirm before running.
confusion_train = confusion_matrix(y_train, pred_train)
print("훈련데이터 오차행렬: \n", confusion_train)
# Same comparison on the test partition.
confusion_test = confusion_matrix(y_test, pred_test)
print("테스트데이터 오차행렬: \n", confusion_test)
# The function is spelled `classification_report`; the misspelled name
# `calssification_report` does not exist in sklearn.metrics (ImportError).
from sklearn.metrics import classification_report
# Text report of per-class metrics for the training partition.
cfreport_train = classification_report(y_train, pred_train)
print("분류예측 레포트: \n", cfreport_train)
# Same report for the test partition.
# NOTE: the variable name `cfreport_train` is reused here for the test report.
cfreport_train = classification_report(y_test, pred_test)
print("분류예측 레포트: \n", cfreport_train)
# Class probabilities for both partitions. `model` must be a fitted
# classifier exposing `predict_proba` (regressors do not provide it), so
# re-run the chosen classifier section first if a regressor was fitted last.
prob_train = model.predict_proba(X_scaled_minmax_train)
# `prob_test` is needed below for the test frame, so compute it here too.
prob_test = model.predict_proba(X_scaled_minmax_test)
# Attach predictions and probabilities as new columns on the label frames.
# NOTE(review): two probability columns assume a binary target -- confirm.
y_train[['y_pred']] = pred_train
y_train[['y_prob0', 'y_prob1']] = prob_train
y_train
y_test[['y_pred']] = pred_test
y_test[['y_prob0', 'y_prob1']] = prob_test
y_test
# Join the unscaled test features with labels/predictions column-wise,
# then export the combined table to CSV.
Total_test = pd.concat([X_test, y_test], axis=1)
Total_test
Total_test.to_csv("classification_test.csv")