1. Credit 데이터셋

 

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the raw credit dataset (the path presumably points at a Colab Google Drive mount).
# NOTE(review): the folder segment is ' data' with a leading space — confirm it matches the real directory.
credit_csv_path = '/content/drive/MyDrive/KDT v2/머신러닝과 딥러닝/ data/credit.csv'
credit_df = pd.read_csv(credit_csv_path)
credit_df

# Allow up to 50 columns to be shown when a DataFrame is displayed.
pd.set_option('display.max_columns', 50)

# Print every column's dtype and non-null count.
credit_df.info()

#  #   Column                    Non-Null Count  Dtype  
# ---  ------                    --------------  -----  
#  0   ID                        12500 non-null  object 
#  1   Customer_ID               12500 non-null  object 
#  2   Name                      11273 non-null  object 
#  3   Age                       12500 non-null  object 
#  4   SSN                       12500 non-null  object 
#  5   Occupation                12500 non-null  object 
#  6   Annual_Income             12500 non-null  object 
#  7   Num_Bank_Accounts         12500 non-null  int64  
#  8   Num_Credit_Card           12500 non-null  int64  
#  9   Interest_Rate             12500 non-null  int64  
#  10  Num_of_Loan               12500 non-null  object 
#  11  Type_of_Loan              11074 non-null  object 
#  12  Delay_from_due_date       12500 non-null  int64  
#  13  Num_of_Delayed_Payment    11657 non-null  object 
#  14  Num_Credit_Inquiries      12264 non-null  float64
#  15  Outstanding_Debt          12500 non-null  object 
#  16  Credit_Utilization_Ratio  12500 non-null  float64
#  17  Credit_History_Age        11387 non-null  object 
#  18  Payment_of_Min_Amount     12500 non-null  object 
#  19  Total_EMI_per_month       12500 non-null  float64
#  20  Amount_invested_monthly   11935 non-null  object 
#  21  Payment_Behaviour         12500 non-null  object 
#  22  Monthly_Balance           12366 non-null  float64
#  23  Credit_Score              12500 non-null  object

한/영 변환

English 한글
ID 고유 식별자
Customer_ID 고객 ID
Name 이름
Age 나이
SSN 주민등록번호
Occupation 직업
Annual_Income 연간 소득
Num_Bank_Accounts 은행 계좌 수
Num_Credit_Card 신용 카드 수
Interest_Rate 이자율
Num_of_Loan 대출 수
Type_of_Loan 대출 유형
Delay_from_due_date 마감일로부터 연체 기간
Num_of_Delayed_Payment 연체된 결제 수
Num_Credit_Inquiries 신용조회 수
Outstanding_Debt 미상환 잔금
Credit_Utilization_Ratio 신용카드 사용률
Credit_History_Age 카드 사용 기간
Payment_of_Min_Amount 리볼빙 여부
Total_EMI_per_month 월별 총 할부 상환액(EMI)
Amount_invested_monthly 매월 투자 금액
Payment_Behaviour 지불 행동
Monthly_Balance 월별 잔고
Credit_Score 신용 점수

 

# Remove the identifier-style columns in place before any analysis.
id_columns = ['ID', 'Customer_ID', 'Name', 'SSN']
credit_df.drop(columns=id_columns, inplace=True)

# Class distribution of the target column.
credit_df['Credit_Score'].value_counts()
# Output =>
# Standard    6943
# Poor        3582
# Good        1975
# Name: Credit_Score, dtype: int64

 

# Encode the target labels as ordered integers: Poor=0, Standard=1, Good=2.
# An explicit element-wise map is used instead of Series.replace because
# replace relies on implicit object->int downcasting, which pandas 2.2
# deprecates (the column would otherwise stay object dtype in later versions).
# Values that are not keys of the mapping are passed through unchanged, as
# replace did, so re-running this cell on already-encoded data is harmless.
score_codes = {'Poor': 0, 'Standard': 1, 'Good': 2}
credit_df['Credit_Score'] = credit_df['Credit_Score'].map(
    lambda label: score_codes.get(label, label)
)

# Summary statistics of the numeric columns (now including the encoded target).
credit_df.describe()

 

# Credit_Score aggregated (seaborn's default estimator) per Payment_of_Min_Amount value.
sns.barplot(data=credit_df, x='Payment_of_Min_Amount', y='Credit_Score')

# Same view per occupation, drawn on a wide figure so the category labels fit.
plt.figure(figsize=(20, 5))
sns.barplot(data=credit_df, x='Occupation', y='Credit_Score')

 

# Correlation heatmap of the numeric columns.
# The frame still contains string columns at this point, and DataFrame.corr()
# raises on non-numeric data in pandas >= 2.0 (older versions silently dropped
# those columns), so the numeric columns are selected explicitly.
plt.figure(figsize=(12, 12))
numeric_corr = credit_df.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, cmap='coolwarm', vmin=-1, vmax=1, annot=True)

 

# List every column that is still stored with object dtype.
for column_name, column_dtype in credit_df.dtypes.items():
    if column_dtype == 'O':
        print(column_name)

# Output =>
# Age
# Occupation
# Annual_Income
# Num_of_Loan
# Type_of_Loan
# Num_of_Delayed_Payment
# Outstanding_Debt
# Credit_History_Age
# Payment_of_Min_Amount
# Amount_invested_monthly
# Payment_Behaviour

 

# Strip underscores from the numeric-looking text columns, then convert them to numbers.
underscore_columns = [
    'Age',
    'Annual_Income',
    'Num_of_Loan',
    'Num_of_Delayed_Payment',
    'Outstanding_Debt',
    'Amount_invested_monthly',
]
for column_name in underscore_columns:
    without_underscores = credit_df[column_name].str.replace('_', '')
    credit_df[column_name] = pd.to_numeric(without_underscores)

# Turn '<years> Years and <months> Months' text into a total number of months:
# drop the ' Months' suffix, split on ' Years and ', then compute years * 12 + months.
history_text = credit_df['Credit_History_Age'].str.replace(' Months', '')
history_parts = history_text.str.split(' Years and ', expand=True)
history_years = pd.to_numeric(history_parts[0])
history_months = pd.to_numeric(history_parts[1])
credit_df['Credit_History_Age'] = history_years * 12 + history_months

 

# Drop rows with a negative age, then look at the remaining age distribution.
credit_df = credit_df[credit_df['Age'] >= 0]
sns.boxplot(y=credit_df['Age'])

# Keep only rows whose values fall inside the bounds below. All conditions are
# combined into a single mask; the selected rows are the same as applying the
# filters one after another. Comparisons against NaN evaluate to False, so a
# row with a missing value in any filtered column is dropped as well.
plausible_rows = (
    (credit_df['Age'] < 110)
    & (credit_df['Num_Bank_Accounts'] <= 10)
    & (credit_df['Num_Credit_Card'] <= 20)
    & (credit_df['Interest_Rate'] <= 40)
    & credit_df['Num_of_Loan'].between(0, 10)
    & (credit_df['Delay_from_due_date'] >= 0)
    & credit_df['Num_of_Delayed_Payment'].between(0, 30)
)

# .copy() makes the filtered frame independent of the original, so the column
# assignment below does not write into a slice (SettingWithCopyWarning).
credit_df = credit_df.loc[plausible_rows].copy()

# A missing inquiry count is treated as zero inquiries.
credit_df['Num_Credit_Inquiries'] = credit_df['Num_Credit_Inquiries'].fillna(0)

 

# Inspect the distributions of three columns that showed missing values in the
# info() output before choosing how to fill them.
sns.displot(credit_df['Credit_History_Age'])

sns.displot(credit_df['Amount_invested_monthly'])

sns.displot(credit_df['Monthly_Balance'])

# Fill the remaining numeric gaps with each column's median.
# numeric_only=True is required: the frame still holds string columns, and
# DataFrame.median() raises TypeError on them in pandas >= 2.0 (older versions
# skipped them silently). String columns are left untouched by this fill.
# NOTE(review): the medians are computed on the full data set, before the
# train/test split performed later in this file.
credit_df = credit_df.fillna(credit_df.median(numeric_only=True))

# Fraction of missing values left in each column.
credit_df.isna().mean()
# 결과값 => 
# Age                         0.000000
# Occupation                  0.000000
# Annual_Income               0.000000
# Num_Bank_Accounts           0.000000
# Num_Credit_Card             0.000000
# Interest_Rate               0.000000
# Num_of_Loan                 0.000000
# Type_of_Loan                0.111056
# Delay_from_due_date         0.000000
# Num_of_Delayed_Payment      0.000000
# Num_Credit_Inquiries        0.000000
# Outstanding_Debt            0.000000
# Credit_Utilization_Ratio    0.000000
# Credit_History_Age          0.000000
# Payment_of_Min_Amount       0.000000
# Total_EMI_per_month         0.000000
# Amount_invested_monthly     0.000000
# Payment_Behaviour           0.000000
# Monthly_Balance             0.000000
# Credit_Score                0.000000

 

# Remove the literal 'and ' so every entry becomes a plain ', '-separated list.
# regex=False states explicitly that this is a literal replacement, independent
# of the pandas version's default.
credit_df['Type_of_Loan'] = credit_df['Type_of_Loan'].str.replace('and ', '', regex=False)

# Rows without any loan information get an explicit 'No Loan' category.
credit_df['Type_of_Loan'] = credit_df['Type_of_Loan'].fillna('No Loan')

# Collect the distinct loan types. explode() flattens the per-row lists in one
# pass; summing the lists would concatenate them pairwise, which is quadratic
# in the number of rows.
type_list = set(credit_df['Type_of_Loan'].str.split(', ').explode())
type_list

# 결과값 => 
# {'Auto Loan',
#  'Credit-Builder Loan',
#  'Debt Consolidation Loan',
#  'Home Equity Loan',
#  'Mortgage Loan',
#  'No Loan',
#  'Not Specified',
#  'Payday Loan',
#  'Personal Loan',
#  'Student Loan'}

# Add one 0/1 indicator column per loan type.
# - sorted(): type_list is a set of strings, whose iteration order can change
#   between interpreter runs; sorting keeps the new columns in a stable order.
# - Membership is tested against the split tokens rather than by substring, so
#   a type name can never match inside a different, longer name.
loans_per_row = credit_df['Type_of_Loan'].str.split(', ')
for loan_type in sorted(type_list):
    credit_df[loan_type] = loans_per_row.apply(
        lambda loans, wanted=loan_type: 1 if wanted in loans else 0
    )

# The original text column is no longer needed once it has been expanded.
credit_df.drop('Type_of_Loan', axis=1, inplace=True)

 

# Frequency of each occupation label, including the placeholder value.
credit_df['Occupation'].value_counts()

# Output =>
# _______          674
# Lawyer           664
# Mechanic         646
# Scientist        640
# Engineer         640
# Architect        633
# Teacher          624
# Developer        621
# Entrepreneur     620
# Media_Manager    616
# Accountant       611
# Doctor           608
# Musician         607
# Journalist       606
# Manager          602
# Writer           592
# Name: Occupation, dtype: int64

# Relabel the underscore placeholder as an explicit 'Unknown' category.
occupation_fixes = {'_______': 'Unknown'}
credit_df['Occupation'] = credit_df['Occupation'].replace(occupation_fixes)

 

# Frequency of each payment-behaviour label, including the garbage token.
credit_df['Payment_Behaviour'].value_counts()
# Output =>
# Low_spent_Small_value_payments      2506
# High_spent_Medium_value_payments    1794
# High_spent_Large_value_payments     1453
# Low_spent_Medium_value_payments     1376
# High_spent_Small_value_payments     1136
# Low_spent_Large_value_payments       995
# !@9#%8                               744
# Name: Payment_Behaviour, dtype: int64

# Relabel the garbage token as 'Unknown'.
# Whole-value Series.replace is used (as for Occupation) instead of
# .str.replace: the latter does substring replacement and, depending on the
# pandas version, interprets the punctuation-heavy token as a regex pattern.
credit_df['Payment_Behaviour'] = credit_df['Payment_Behaviour'].replace('!@9#%8', 'Unknown')

 

# One-hot encode the remaining categorical columns.
# `columns` must be a list: get_dummies indexes the frame with it, and indexing
# with a set raises TypeError in pandas >= 2.0 (and gave a run-dependent column
# order before that).
categorical_columns = ['Occupation', 'Payment_of_Min_Amount', 'Payment_Behaviour']
credit_df = pd.get_dummies(credit_df, columns=categorical_columns)

 

from sklearn.model_selection import train_test_split

# Separate the predictors from the target, then hold out 20% of the rows for
# evaluation with a fixed seed.
features = credit_df.drop('Credit_Score', axis=1)
target = credit_df["Credit_Score"]
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=2023,
)

X_train.shape, y_train.shape
# Output => ((8003, 51), (8003,))

X_test.shape, y_test.shape
# Output => ((2001, 51), (2001,))

 

LightGBM(LGBM)

  • Microsoft에서 개발한 Gradient Boosting Framework
  • 리프 중심 히스토그램 기반 알고리즘
  • GBM(Gradient Boosting Machine) : 모델1로 y를 예측한 뒤, 모델2는 모델1이 남긴 오차(잔차)를 학습하고, 모델3은 다시 모델2까지의 오차를 학습하는 식으로 여러 모델을 순차적으로 더해 가며 y를 예측하는 방식
  • 작은 데이터셋에서도 높은 성능을 보이며, 특히 대용량 데이터셋에서 다른 Gradient Boosting 알고리즘보다 빠르게 학습.
  • 메모리 사용량이 상대적으로 적은편
  • 적은 데이터셋을 사용할 경우 과적합 가능성이 매우 큼(일반적으로 데이터가 10,000개 이상은 사용 해야함)
  • 조기 중단(early stopping)을 지원

리프 중심 히스토그램 기반 알고리즘

  • 트리를 균형적으로(level-wise) 분할하는 것이 아니라, 손실을 가장 크게 줄이는 리프를 골라 분할(leaf-wise)하므로 트리가 불균형하게 성장
  • 특성들의 분포를 히스토그램으로 나타내고, 해당 히스토그램을 이용하여 빠르게 후보 분할 기준을 선택
  • 후보 분할 기준 중에서 최적의 분할 기준을 선택하기 위해, 데이터 포인트들을 히스토그램의 구간(bin)에 배치하고, 이를 이용하여 최적의 분할 기준을 선택
from lightgbm import LGBMClassifier

# Baseline classifier: only the random seed is set; fit() returns the fitted
# estimator itself, so construction and training are chained.
base_model = LGBMClassifier(random_state=2023).fit(X_train, y_train)

# Predicted class labels for the held-out rows.
pred = base_model.predict(X_test)
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score

# Overall fraction of correctly classified test rows.
accuracy_score(y_true=y_test, y_pred=pred)
# Output => 0.7251374312843578

# Rows are the true classes, columns the predicted classes.
confusion_matrix(y_true=y_test, y_pred=pred)
# Output =>
# array([[409, 144,  25],
#        [154, 855, 108],
#        [  4, 115, 187]])
print(classification_report(y_true=y_test, y_pred=pred))
# Output =>
#               precision    recall  f1-score   support
#
#            0       0.72      0.71      0.71       578
#            1       0.77      0.77      0.77      1117
#            2       0.58      0.61      0.60       306
#
#     accuracy                           0.73      2001
#    macro avg       0.69      0.69      0.69      2001
# weighted avg       0.73      0.73      0.73      2001

# Per-class probabilities, needed for the ROC AUC computation.
proba = base_model.predict_proba(X_test)

# With three or more classes, passing only (y_test, proba) raises an error,
# so multi_class='ovr' (one-vs-rest) is given to obtain the score.
roc_auc_score(y_true=y_test, y_score=proba, multi_class='ovr')

# Output => 0.8932634160489487