본문 바로가기
Experience/- KT AIVLE School

KT AIVLE School 4주차 정리 - 회귀, 분류 모델 선택 방법

by Yoojacha 2023. 3. 6.

미니프로젝트를 진행하면서 코드 스니펫은 매우 중요합니다. 다른 사람들은 성능을 평가하고 있을 때, 키보드를 뚝딱이고 있다면 반성하고 미리미리 복습하면서 코드 스니펫을 만듭시다! 실습을 하면서 제공해준 자료는 내가 만든 것이 아니기 때문에 소화가 안 된 코드입니다!

 

강사님께서 정말 회귀와 분류에 대해서 찰떡처럼 쉽고 이해가 잘 되게 설명을 해주셔서 19년도에 혼자 처음 배웠던 머신러닝을 왜 어려워했나 싶은 마음도 듭니다! 아무튼 아래의 코드는 회귀 문제와 분류 문제를 위한 모델 선택을 할때 반복을 돌려서 어느정도 성능이 나오는지만 체크하는 코드입니다!

정성 들여서 코드를 정리한 만큼 가져가신다면.. 댓글로 반응해 주시면 큰 힘이 됩니다!


회귀 모듈 불러오기

from sklearn.linear_model import LinearRegression as RL
from sklearn.neighbors import KNeighborsRegressor as KNNR
from sklearn.tree import DecisionTreeRegressor as DTR
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor as RFR
from xgboost import XGBRegressor as XGBR
from lightgbm import LGBMRegressor as LGBMR
from catboost import CatBoostRegressor as CBR

from lightgbm import plot_importance as lgbm_plot_importance
from xgboost import plot_importance as xgb_plot_importance
from catboost import Pool

from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score

회귀 모델 성능 평가

# Candidate regression models, keyed by a short display name.
# NOTE(review): XGBRegressor is imported above but never added here — add
# 'XGBR': XGBR(max_depth=5, random_state=1) if it should be compared too.
models = {
    'RL': RL(n_jobs=-1),
    'KNNR': KNNR(n_neighbors=5),
    'DTR': DTR(max_depth=5, random_state=1),
    'SVR': SVR(kernel='rbf', C=1),
    'RFR': RFR(max_depth=5, random_state=1),
    'LGBMR': LGBMR(max_depth=5, num_leaves=31, learning_rate=0.1),
    # BUG FIX: the np.float alias was removed in NumPy 1.24, so
    # `x_train.dtypes != np.float` raises AttributeError on current NumPy.
    # Comparing against the builtin `float` (== float64 dtype) is equivalent:
    # every non-float column index is treated as a categorical feature.
    'CBR': CBR(iterations=100,
               cat_features=np.where(x_train.dtypes != float)[0],
               random_state=1),
}

# Fit every candidate regressor, report MSE / R² on the test split,
# and keep the fitted estimators plus their R² scores for later plots.
results = {}
r2_scores = {}

for name, model in models.items():
    model.fit(x_train, y_train)
    pred = model.predict(x_test)

    r2_scores[name] = r2_score(y_test, pred)
    results[name] = model

    print(name, '=' * 30)
    print('mse : ', mse(y_test, pred))
    print('r2_score : ', r2_scores[name])
    print('-' * (len(name) + 30))

1개 모델 변수 중요도 시각화

# BUG FIX: 'LGBMC' is a *classification* key; the regression `results`
# dict built above only contains 'LGBMR', so the original line raised
# KeyError. (Keys without feature_importances_: 'RL', 'KNNR', 'SVR'.)
key = 'LGBMR'
# Rank features by the fitted model's importance, highest first.
tmp = pd.DataFrame({'feature_importance': results[key].feature_importances_,
                    'feature_names': list(x_train)}) \
        .sort_values('feature_importance', ascending=False)

plt.figure(figsize=(16, 6))
sns.barplot(x='feature_importance', y='feature_names', data=tmp)
plt.title(key)
plt.show()

여러 모델 변수 중요도 시각화

# Plot the top-20 feature importances for every fitted model that
# exposes feature_importances_ (linear/KNN/SVM models are skipped).
for name, model in results.items():
    if name in ('RL', 'KNNR', 'SVR'):
        continue
    ranked = (pd.DataFrame({'feature_importance': model.feature_importances_,
                            'feature_names': list(x_train)})
                .sort_values('feature_importance', ascending=False)
                .head(20))
    plt.figure(figsize=(16, 6))
    sns.barplot(x='feature_importance', y='feature_names', data=ranked)
    plt.title(name)
    plt.show()

모델별 r2_scores 시각화

# Bar chart of test-set R² per model, best model on top.
ranking = pd.DataFrame({'r2_scores': r2_scores.values(),
                        'models': list(r2_scores)})
ranking = ranking.sort_values('r2_scores', ascending=False)

plt.figure(figsize=(16, 6))
sns.barplot(x='r2_scores', y='models', data=ranking)
plt.show()

분류 모듈 불러오기

from sklearn.linear_model import LogisticRegression as LR
from sklearn.neighbors import KNeighborsClassifier as KNNC
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier as RFC
from xgboost import XGBClassifier as XGBC
from lightgbm import LGBMClassifier as LGBMC
from catboost import CatBoostClassifier as CBC

from sklearn.metrics import confusion_matrix as cmatrix
from sklearn.metrics import classification_report as creport
from sklearn.metrics import recall_score as recall
from sklearn.metrics import accuracy_score as acc
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import make_scorer

분류 모델 성능 평가

# Candidate classifiers, keyed by a short display name.
# NOTE(review): XGBClassifier is imported above but never added here — add
# 'XGBC': XGBC(max_depth=5, random_state=1) if it should be compared too.
models = {
    'LR': LR(C=1, random_state=1, n_jobs=-1),
    'KNN': KNNC(n_neighbors=5, n_jobs=-1),
    'DTC': DTC(max_depth=5, random_state=1),
    'SVC': SVC(kernel='rbf', C=1),
    'RFC': RFC(n_estimators=10, max_depth=5, random_state=1),
    'LGBMC': LGBMC(max_depth=5, num_leaves=31, learning_rate=0.1, random_state=1),
    # BUG FIX: the np.float alias was removed in NumPy 1.24, so
    # `x_train.dtypes != np.float` raises AttributeError on current NumPy.
    # The builtin `float` compares identically (float64 dtype): every
    # non-float column index is treated as a categorical feature.
    'CBC': CBC(iterations=100,
               cat_features=np.where(x_train.dtypes != float)[0],
               verbose=False,
               random_state=1),
}

# Fit every candidate classifier and record, per model: the fitted
# estimator, its test-set predictions, and accuracy / recall scores.
results = {}
acc_results = {}
recall_results = {}
y_preds = {}

for name, clf in models.items():
    clf.fit(x_train, y_train)
    pred = clf.predict(x_test)

    results[name] = clf
    y_preds[name] = pred
    acc_results[name] = acc(y_test, pred)
    recall_results[name] = recall(y_test, pred)

분류 모델별 confusion_matrix, classification_report 출력

# Inspect one model in detail; pick from:
# 'LR', 'KNN', 'DTC', 'SVC', 'RFC', 'LGBMC', 'CBC'
key = 'CBC'
pred = y_preds[key]

print(key, '=' * 40)
print('confusion_matrix\n', cmatrix(y_test, pred))
print('classification_report\n', creport(y_test, pred))
print('-' * 60)

분류 모델별 Accuracy score 시각화

# Bar chart of test-set accuracy per classifier, best model on top.
ranking = pd.DataFrame({'accuracy_scores': acc_results.values(),
                        'models': list(acc_results)})
ranking = ranking.sort_values('accuracy_scores', ascending=False)

plt.figure(figsize=(16, 6))
sns.barplot(x='accuracy_scores', y='models', data=ranking)
plt.show()

분류 모델별 Recall score 시각화

# Bar chart of test-set recall per classifier, best model on top.
ranking = pd.DataFrame({'recall_scores': recall_results.values(),
                        'models': list(recall_results)})
ranking = ranking.sort_values('recall_scores', ascending=False)

plt.figure(figsize=(16, 6))
sns.barplot(x='recall_scores', y='models', data=ranking)
plt.tight_layout()
plt.show()

분류 모델별 학습성능, 평가성능 출력

# Print train vs. test accuracy side by side to spot overfitting
# (a large train/test gap means the model memorized the training set).
print('모델 : 학습성능 / 평가성능 (Accuracy)')
for name, model in results.items():
    train_acc = round(model.score(x_train, y_train), 4)
    test_acc = round(model.score(x_test, y_test), 4)
    print(f'{name:5} : {train_acc:6} / {test_acc:6}\n')

댓글