19 분 소요

데이콘에서 실시했던 추천 알고리즘 경진대회에 참여해보았습니다.

범주형 feature가 많았던 데이터라서 Catboost를 이용해서 접근해 보았습니다.

아직 부족하지만 더 열심히 공부해보겠습니다!

라이브러리 불러오기

import pandas as pd
import numpy as np
from tqdm import tqdm_notebook

from sklearn.model_selection import KFold,GridSearchCV,train_test_split,StratifiedKFold
import matplotlib.pyplot as plt

import seaborn as sns
from catboost import CatBoostClassifier
%matplotlib inline

import warnings
warnings.filterwarnings(action='ignore')

# score 계산 함수

from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
def get_clf_eval(y_test, pred):
    """Print accuracy, precision, recall and F1 for a set of predictions.

    Args:
        y_test: Ground-truth labels.
        pred: Predicted labels, aligned with ``y_test``.

    Returns:
        None. The scores are written to stdout only.
    """
    scores = (
        accuracy_score(y_test, pred),
        precision_score(y_test, pred),
        recall_score(y_test, pred),
    )
    print('정확도:{}, 정밀도:{}, 재현율:{}'.format(*scores))

    f1 = f1_score(y_test, pred)
    print(f"F1 score : {f1}")
# Load the raw train/test tables; both files are decoded as EUC-KR.
train = pd.read_csv("train.csv", encoding='euc-kr')
test = pd.read_csv("test.csv", encoding='euc-kr')

전처리

  • 속성 D, H, L에 대해서 (컨텐츠 속성, 회원 선호 속성) 일치하는지 여부에 대한 컬럼을 (대,중,소,세분류) 생성
  • bool컬럼 0,1로 전환
# Load the D, H and L attribute code tables.
# Each CSV is indexed by its first column; `.T.to_dict()` turns it into
# {index value: {column name: value}} so a code can be looked up directly.
# NOTE(review): only the D file is read as EUC-KR, the H and L files use the
# pandas default encoding -- confirm that matches how those files are stored.

d_code = pd.read_csv('속성_D_코드.csv', index_col=0, encoding='euc-kr').T.to_dict()
h_code = pd.read_csv('속성_H_코드.csv', index_col=0).T.to_dict()
l_code = pd.read_csv('속성_L_코드.csv', index_col=0).T.to_dict()

def add_code(df, d_code, h_code, l_code):
    """Return a copy of ``df`` with hierarchy-code columns for attributes D, H and L.

    For every source column a derived column ``<source>_<suffix>`` is appended,
    where the suffix selects one level of the code table:
    ``n`` -> '세분류코드', ``s`` -> '소분류코드', ``m`` -> '중분류코드',
    ``l`` -> '대분류코드'.  D and L columns get all four levels, H columns only
    ``m`` and ``l``.

    The derived columns are appended in a fixed order (D, then H, then L;
    within each, source column by source column).  The match-flag code further
    down the notebook pairs columns by position, so this order must not change.

    Args:
        df: Frame holding ``person_prefer_d_{1,2,3}``, ``contents_attribute_d``,
            ``person_prefer_h_{1,2,3}``, ``contents_attribute_h`` and
            ``contents_attribute_l``.  It is not modified.
        d_code: Mapping ``code -> {'속성 D <level>': value}``.
        h_code: Mapping ``code -> {'속성 H <level>': value}``.
        l_code: Mapping ``code -> {'속성 L <level>': value}``.

    Returns:
        A new DataFrame with the derived columns appended.

    Raises:
        KeyError: If a value in a source column is missing from its code
            table, or a code-table row lacks one of the level keys.
    """
    df = df.copy()

    level_names = {'n': '세분류코드', 's': '소분류코드', 'm': '중분류코드', 'l': '대분류코드'}

    # (attribute letter, code table, source columns, level suffixes), listed in
    # the exact order in which the derived columns have to be appended.
    plan = [
        ('D', d_code,
         ['person_prefer_d_1', 'person_prefer_d_2', 'person_prefer_d_3', 'contents_attribute_d'],
         'nsml'),
        ('H', h_code,
         ['person_prefer_h_1', 'person_prefer_h_2', 'person_prefer_h_3', 'contents_attribute_h'],
         'ml'),
        ('L', l_code,
         ['contents_attribute_l'],
         'nsml'),
    ]

    for letter, table, source_cols, suffixes in plan:
        # Flatten each hierarchy level into its own {code: value} dict once,
        # instead of doing a nested dict lookup inside a lambda for every row.
        lookups = {
            suffix: {code: row[f'속성 {letter} {level_names[suffix]}'] for code, row in table.items()}
            for suffix in suffixes
        }
        for source in source_cols:
            for suffix in suffixes:
                # Map through __getitem__ (not the dict itself) so an unknown
                # code raises KeyError rather than silently becoming NaN.
                df[f'{source}_{suffix}'] = df[source].map(lookups[suffix].__getitem__)
    return df



# Append the hierarchy-code columns to both splits using the same code tables.
train = add_code(train, d_code, h_code, l_code)
test = add_code(test, d_code, h_code, l_code)

일치하는지에 대한 컬럼 생성(D,H)

def add_match_columns(df):
    """Return ``df`` with boolean preference/content match flags for D and H.

    For each preference slot ``i`` in 1..3 and each hierarchy suffix, the
    person column ``person_prefer_<attr>_<i><suffix>`` is compared with the
    content column ``contents_attribute_<attr><suffix>`` and the result is
    stored as ``<attr><i><suffix>_match_yn``.  Attribute D uses the raw code
    plus the ``_n``, ``_s``, ``_m``, ``_l`` levels; attribute H uses the raw
    code plus ``_m`` and ``_l``.

    Columns are paired by explicit name, so the result does not depend on the
    order in which the columns happen to appear in ``df``.

    Args:
        df: Frame that already contains the raw and derived D/H columns.

    Returns:
        A new DataFrame: ``df`` followed by the 24 flag columns
        (d1.., d2.., d3.., h1.., h2.., h3..).

    Raises:
        KeyError: If one of the expected columns is missing from ``df``.
    """
    suffixes_by_attr = {
        'd': ['', '_n', '_s', '_m', '_l'],
        'h': ['', '_m', '_l'],
    }

    flags = {}
    for attr, suffixes in suffixes_by_attr.items():
        for i in (1, 2, 3):
            for suffix in suffixes:
                person = df[f'person_prefer_{attr}_{i}{suffix}'].to_numpy()
                content = df[f'contents_attribute_{attr}{suffix}'].to_numpy()
                flags[f'{attr}{i}{suffix}_match_yn'] = person == content

    # Reuse df's index so the concat lines up row by row even when the index
    # is not the default 0..n-1 range.
    return pd.concat([df, pd.DataFrame(flags, index=df.index)], axis=1)


# Build the D/H match flags for both splits with the same logic.
train_1 = add_match_columns(train)
test_1 = add_match_columns(test)
# Add match flags for attributes C and E: True where the member's preferred
# value equals the content's value.

for frame in (train_1, test_1):
    for attr in ('c', 'e'):
        preferred = frame[f'person_prefer_{attr}']
        actual = frame[f'contents_attribute_{attr}']
        frame[f'{attr}_match_yn'] = (preferred == actual)

필요한 컬럼 다 갖춘 데이터 완성

# Persist the engineered frames and load them back.
# The row index is not written, so nothing has to be stripped by position
# after reading the files again.
train_1.to_csv("train_1.csv", index=False)
test_1.to_csv("test_1.csv", index=False)
train_1 = pd.read_csv('train_1.csv')
test_1 = pd.read_csv('test_1.csv')

데이터 불러오기

# Show (rows, columns) of both frames after reloading.
print(train_1.shape)
print(test_1.shape)
(501951, 89)
(46404, 88)

사용할 컬럼 정하기

  • contents_open_dt, contents_rn, id, person_rn은 제외(사용하면 성능이 더 떨어짐)
  • person_prefer_f, person_prefer_g는 모든 데이터에 대해서 동일한 값을 가지므로 제외
  • train데이터에서는 target도 우선 제외
# Columns left out of the feature list (see the bullet list above), plus the label.
excluded = {'contents_open_dt', 'contents_rn', 'id', 'person_rn',
            'person_prefer_f', 'person_prefer_g', 'target'}
# Filter in frame order instead of using a set difference: set iteration order
# for strings changes between interpreter runs, which would shuffle the
# feature order and make the fitted models non-reproducible.
cols = [col for col in train_1.columns if col not in excluded]
len(cols)
82

target 제외하고 총 82개의 컬럼을 사용함

# Work on explicit copies: both frames get columns reassigned below, and
# assigning into a plain slice of test_1 is not guaranteed to stick.
train_temp = train_1[cols + ['target']].copy()
test_temp = test_1[cols].copy()

# Columns to treat as nominal (categorical) features; the list is cast to
# object dtype below and passed to CatBoost as `cat_features`.
categori_col = [
                # member attributes
                'person_attribute_a',
                # member preference attributes (raw codes and their l/m/s/n hierarchy levels)
                'person_prefer_c',
                'person_prefer_d_1',
                'person_prefer_d_1_l','person_prefer_d_1_m','person_prefer_d_1_s','person_prefer_d_1_n',
                'person_prefer_d_2',
                'person_prefer_d_2_l','person_prefer_d_2_m','person_prefer_d_2_s','person_prefer_d_2_n',
                'person_prefer_d_3',
                'person_prefer_d_3_l','person_prefer_d_3_m','person_prefer_d_3_s','person_prefer_d_3_n',
                'person_prefer_h_1',
                'person_prefer_h_1_l','person_prefer_h_1_m',
                'person_prefer_h_2',
                'person_prefer_h_2_l','person_prefer_h_2_m',
                'person_prefer_h_3',
                'person_prefer_h_3_l','person_prefer_h_3_m',
                # content attributes
                'contents_attribute_i', 'contents_attribute_a', 'contents_attribute_j_1', 'contents_attribute_j', 'contents_attribute_c',
                'contents_attribute_h',
                'contents_attribute_h_l','contents_attribute_h_m',
                'contents_attribute_d',
                'contents_attribute_d_l','contents_attribute_d_m','contents_attribute_d_s','contents_attribute_d_n',
                'contents_attribute_k','contents_attribute_m',
                'contents_attribute_l',
                'contents_attribute_l_l','contents_attribute_l_m','contents_attribute_l_s','contents_attribute_l_n',
                ]
# Convert True / False flags to 1 / 0 in both frames.

for frame in (train_temp, test_temp):
    flag_columns = frame.select_dtypes(include='bool').columns
    for name in flag_columns:
        frame[name] = frame[name].astype(int)
# Store the nominal columns as object dtype in both frames.

for frame in (train_temp, test_temp):
    frame[categori_col] = frame[categori_col].astype('object')

# Expected dtype breakdown of train_temp:
#   46 categorical (object) columns
#   37 numeric columns
#   82 features + 1 target column in total
train_temp.dtypes.value_counts()
object    46
int32     32
int64      5
dtype: int64
# Numeric columns: number of distinct values per column.
train_temp.select_dtypes(exclude='object').apply(pd.Series.nunique, axis=0)
h_l_match_yn             2
h_s_match_yn             2
d2_match_yn              2
d3_m_match_yn            2
h3_l_match_yn            2
h1_m_match_yn            2
c_match_yn               2
d3_match_yn              2
d3_l_match_yn            2
d1_match_yn              2
d_s_match_yn             2
d3_s_match_yn            2
person_prefer_e         12
h1_match_yn              2
d2_l_match_yn            2
contents_attribute_e    12
e_match_yn               2
h2_l_match_yn            2
h2_m_match_yn            2
d_m_match_yn             2
person_attribute_b       6
h3_match_yn              2
d_l_match_yn             2
h1_l_match_yn            2
d1_m_match_yn            2
d2_n_match_yn            2
d1_n_match_yn            2
h2_match_yn              2
d1_s_match_yn            2
person_attribute_a_1     8
h_m_match_yn             2
d3_n_match_yn            2
d1_l_match_yn            2
h3_m_match_yn            2
d2_s_match_yn            2
d2_m_match_yn            2
target                   2
dtype: int64
# Categorical (object) columns: number of distinct values per column.
train_temp.select_dtypes('object').apply(pd.Series.nunique, axis=0)
contents_attribute_h       250
person_prefer_d_3_l         11
person_prefer_h_3          279
contents_attribute_k         2
contents_attribute_c         4
person_prefer_d_1_n        443
person_prefer_c              5
person_prefer_d_2_n        435
person_prefer_h_1_l         19
person_prefer_d_2         1081
person_prefer_d_2_s        137
contents_attribute_a         3
person_prefer_d_3_n        420
contents_attribute_d_s     137
person_prefer_h_2          279
contents_attribute_d_m      36
person_prefer_h_1_m        246
person_prefer_d_1_m         36
contents_attribute_j         2
contents_attribute_l_n     736
contents_attribute_m         5
person_prefer_d_2_m         36
person_prefer_d_1_l         11
person_prefer_h_1          279
contents_attribute_j_1       9
person_prefer_d_3_m         36
person_attribute_a           2
contents_attribute_l_m      79
person_prefer_d_1_s        137
person_prefer_h_2_l         19
contents_attribute_i         3
contents_attribute_l_s     305
person_prefer_h_3_m        246
contents_attribute_h_l      17
person_prefer_h_3_l         19
contents_attribute_d_l      11
person_prefer_d_1         1093
contents_attribute_d      1065
person_prefer_d_2_l         11
contents_attribute_h_m     228
person_prefer_d_3_s        136
contents_attribute_l_l      21
contents_attribute_l      1752
contents_attribute_d_n     431
person_prefer_d_3         1043
person_prefer_h_2_m        246
dtype: int64
# (rows, columns) of the training frame, target column included.
train_temp.shape
(501951, 83)
# (rows, columns) of the test frame; it is built from `cols`, which excludes the target.
test_temp.shape
(46404, 82)

Train에서 퍼포먼스 보기

# Split the labelled data into train / validation / hold-out test sets.
# Both splits are stratified on the target and seeded, so re-running the
# notebook reproduces the same partitions.
split_seed = 116

X = train_temp[train_temp.columns.difference(['target'])]
y = train_temp[['target']]

# 80 % train+validation, 20 % hold-out test.
train_x, test_x, train_y, test_y = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=split_seed)
# 12.5 % of the remaining 80 % becomes validation, i.e. 10 % of all rows.
train_x, valid_x, train_y, valid_y = train_test_split(
    train_x, train_y, test_size=0.125, stratify=train_y, random_state=split_seed)
print(train_x.shape)
print(train_y.shape)
print(test_x.shape)
print(test_y.shape)
print(valid_x.shape)
print(valid_y.shape)
(351365, 82)
(351365, 1)
(100391, 82)
(100391, 1)
(50195, 82)
(50195, 1)

노트북 사양이 좋지 않아 k-fold CV는 제출용 모델을 학습할 때만 사용할 예정입니다. 우선 단일 train/validation 분할로 모델을 돌려보겠습니다.

# Fit a CatBoostClassifier on the training split.
seed = 116

cat = CatBoostClassifier(
    n_estimators=300,
    random_seed=seed,
    bootstrap_type='Bernoulli',
    eval_metric="F1",
    one_hot_max_size=4,
)

# Both the training and the validation split are passed as evaluation sets;
# training stops early after 50 rounds without improvement.
cat.fit(
    train_x,
    train_y,
    cat_features=categori_col,
    eval_set=[(train_x, train_y), (valid_x, valid_y)],
    early_stopping_rounds=50,
    verbose=10,
)
Learning rate set to 0.227595
0:	learn: 0.6003634	test: 0.6028502	test1: 0.5975898	best: 0.5975898 (0)	total: 6.52s	remaining: 32m 28s
10:	learn: 0.6408569	test: 0.7035863	test1: 0.6519279	best: 0.6519279 (10)	total: 56.4s	remaining: 24m 42s
20:	learn: 0.6467647	test: 0.7461439	test1: 0.6643557	best: 0.6643557 (20)	total: 1m 42s	remaining: 22m 35s
30:	learn: 0.6505808	test: 0.7705283	test1: 0.6698844	best: 0.6698844 (30)	total: 2m 28s	remaining: 21m 26s
40:	learn: 0.6523066	test: 0.7913883	test1: 0.6748939	best: 0.6748939 (40)	total: 3m 12s	remaining: 20m 14s
50:	learn: 0.6543541	test: 0.7934416	test1: 0.6746880	best: 0.6758294 (45)	total: 4m 9s	remaining: 20m 15s
60:	learn: 0.6559083	test: 0.7955448	test1: 0.6771044	best: 0.6774164 (59)	total: 5m 1s	remaining: 19m 40s
70:	learn: 0.6574470	test: 0.8012121	test1: 0.6791637	best: 0.6797244 (69)	total: 5m 42s	remaining: 18m 25s
80:	learn: 0.6593060	test: 0.8068754	test1: 0.6795986	best: 0.6799739 (76)	total: 6m 23s	remaining: 17m 16s
90:	learn: 0.6609507	test: 0.8116928	test1: 0.6816082	best: 0.6819080 (87)	total: 7m 7s	remaining: 16m 21s
100:	learn: 0.6621940	test: 0.8127825	test1: 0.6813540	best: 0.6819080 (87)	total: 8m 9s	remaining: 16m 3s
110:	learn: 0.6628987	test: 0.8167067	test1: 0.6827510	best: 0.6827510 (110)	total: 8m 51s	remaining: 15m 4s
120:	learn: 0.6640435	test: 0.8176961	test1: 0.6833405	best: 0.6835197 (119)	total: 9m 35s	remaining: 14m 11s
130:	learn: 0.6645559	test: 0.8183881	test1: 0.6837454	best: 0.6837652 (124)	total: 10m 20s	remaining: 13m 20s
140:	learn: 0.6653198	test: 0.8196152	test1: 0.6839979	best: 0.6843953 (138)	total: 11m 6s	remaining: 12m 31s
150:	learn: 0.6659125	test: 0.8203926	test1: 0.6837485	best: 0.6843953 (138)	total: 11m 52s	remaining: 11m 42s
160:	learn: 0.6662940	test: 0.8205866	test1: 0.6831761	best: 0.6843953 (138)	total: 12m 49s	remaining: 11m 4s
170:	learn: 0.6668303	test: 0.8204332	test1: 0.6838282	best: 0.6843953 (138)	total: 13m 40s	remaining: 10m 19s
180:	learn: 0.6678044	test: 0.8214092	test1: 0.6836710	best: 0.6843953 (138)	total: 14m 26s	remaining: 9m 29s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.6843953089
bestIteration = 138

Shrink model to first 139 iterations.





<catboost.core.CatBoostClassifier at 0x1e5cf32f310>

threshold를 조정하면 F1-score가 더 좋게 나와서, 0.30~0.60 구간을 0.01 간격으로 탐색(grid search)하여 최적의 threshold 값을 찾아보겠습니다.

# Score the hold-out split once; the sweep below only re-thresholds these
# probabilities.  Without this line `pred_prob` is undefined at this point
# when the notebook is run top to bottom.
pred_prob = cat.predict_proba(test_x)
positive_prob = pred_prob[:, 1]

# NOTE(review): the threshold is chosen on test_x/test_y, the same split that
# is used for the final report below, so that reported F1 is optimistic.
# Sweeping on valid_x/valid_y instead would keep the test score unbiased.
threashold = np.arange(0.3,0.61,0.01)
for i in threashold:
    pred = (positive_prob > i).astype('int')
    print(f"threashold : {round(i,3)} , F1_score : {f1_score(test_y,pred)}")
  0%|          | 0/31 [00:00<?, ?it/s]


threashold : 0.3 , F1_score : 0.7030653590933115
threashold : 0.31 , F1_score : 0.7043846614473597
threashold : 0.32 , F1_score : 0.7055239854322295
threashold : 0.33 , F1_score : 0.706482976087691
threashold : 0.34 , F1_score : 0.7072480009692271
threashold : 0.35 , F1_score : 0.7085153839102937
threashold : 0.36 , F1_score : 0.7092232485697102
threashold : 0.37 , F1_score : 0.7095925389633643
threashold : 0.38 , F1_score : 0.7100758362440605
threashold : 0.39 , F1_score : 0.7098318672753591
threashold : 0.4 , F1_score : 0.7098077077801915
threashold : 0.41 , F1_score : 0.7094778318706548
threashold : 0.42 , F1_score : 0.7083024156979141
threashold : 0.43 , F1_score : 0.7071526994864087
threashold : 0.44 , F1_score : 0.7054687764713855
threashold : 0.45 , F1_score : 0.7031824586727154
threashold : 0.46 , F1_score : 0.6999057788944724
threashold : 0.47 , F1_score : 0.6966325996968541
threashold : 0.48 , F1_score : 0.6925287615203741
threashold : 0.49 , F1_score : 0.6879715446036505
threashold : 0.5 , F1_score : 0.683328196650883
threashold : 0.51 , F1_score : 0.6770299196118537
threashold : 0.52 , F1_score : 0.6697723170092588
threashold : 0.53 , F1_score : 0.6623810136989015
threashold : 0.54 , F1_score : 0.6535716093757911
threashold : 0.55 , F1_score : 0.6432878188260158
threashold : 0.56 , F1_score : 0.6331234230222845
threashold : 0.57 , F1_score : 0.6213353898960494
threashold : 0.58 , F1_score : 0.6076155251548505
threashold : 0.59 , F1_score : 0.5922310119347021
threashold : 0.6 , F1_score : 0.5753981303142763

F1-score가 가장 높은 threshold는 0.38(약 0.7101)이지만 0.4(약 0.7098)와 차이가 거의 없어서, 깔끔하게 0.4로 결정했습니다.

# Evaluate on the hold-out split with the chosen 0.4 cut-off on the
# positive-class probability (column 1 of predict_proba).
pred_prob = cat.predict_proba(test_x)
temp = pd.DataFrame(pred_prob).gt(0.4)
pred = temp[1].astype('int')
get_clf_eval(test_y['target'], pred)
정확도:0.6407247661642976, 정밀도:0.5951214905357456, 재현율:0.8792481412824653
F1 score : 0.7098077077801915

F1 score 기준 약 0.71정도의 성능을 보여줍니다.

테스트 데이터로 submission 제출하기

  • kfoldcv실행
# Use the full labelled set for the k-fold run: features and label separated.
train_x = train_temp.drop(columns=['target'])
train_y = train_temp['target']
%%time

seed = 2022
k = 5
models = []   # fitted model of each fold
result = []   # per-fold class probabilities predicted for test_temp
folds = StratifiedKFold(n_splits = k, shuffle = True, random_state = seed)

# Wrap the split generator itself and pass total=k: wrapping enumerate(...)
# leaves the progress bar without a length, so it cannot show progress.
fold_iter = tqdm_notebook(folds.split(train_x, train_y), total=k)

for n_fold, (train_idx, valid_idx) in enumerate(fold_iter):
    print(f'=================================={n_fold+1}==========================================')

    # Split into this fold's train / validation parts
    train_x_fold, valid_x_fold = train_x.iloc[train_idx], train_x.iloc[valid_idx]
    train_y_fold, valid_y_fold = train_y.iloc[train_idx], train_y.iloc[valid_idx]

    # Build the model
    cat = CatBoostClassifier(n_estimators=300, random_seed=seed, bootstrap_type ='Bernoulli',eval_metric="F1", custom_loss='F1')

    # Fit, stopping early after 50 rounds without improvement
    cat.fit(train_x_fold,train_y_fold,
        eval_set = [(train_x_fold, train_y_fold), (valid_x_fold, valid_y_fold)],
        early_stopping_rounds = 50, 
        cat_features = categori_col,
        verbose = 10)

    # Predict the competition test set (as class probabilities)
    test_pred_prob = cat.predict_proba(test_temp)

    models.append(cat)
    result.append(test_pred_prob)

    print('================================================================================\n\n')


0it [00:00, ?it/s]


==================================1==========================================
Learning rate set to 0.235227
0:	learn: 0.5962659	test: 0.5967534	test1: 0.5945962	best: 0.5945962 (0)	total: 13s	remaining: 1h 4m 39s
10:	learn: 0.6420607	test: 0.7252085	test1: 0.6597472	best: 0.6597472 (10)	total: 1m 17s	remaining: 33m 47s
20:	learn: 0.6510418	test: 0.7609060	test1: 0.6713420	best: 0.6713420 (20)	total: 2m 15s	remaining: 30m 6s
30:	learn: 0.6537382	test: 0.7695586	test1: 0.6757894	best: 0.6757894 (30)	total: 3m 8s	remaining: 27m 19s
40:	learn: 0.6559261	test: 0.7805929	test1: 0.6775557	best: 0.6776516 (39)	total: 4m 4s	remaining: 25m 42s
50:	learn: 0.6578211	test: 0.7813806	test1: 0.6783032	best: 0.6786181 (48)	total: 4m 49s	remaining: 23m 35s
60:	learn: 0.6593789	test: 0.7928956	test1: 0.6805314	best: 0.6806777 (58)	total: 5m 37s	remaining: 22m 3s
70:	learn: 0.6608937	test: 0.7945964	test1: 0.6804401	best: 0.6810295 (62)	total: 6m 27s	remaining: 20m 48s
80:	learn: 0.6622386	test: 0.8050873	test1: 0.6835431	best: 0.6835431 (80)	total: 7m 21s	remaining: 19m 52s
90:	learn: 0.6635762	test: 0.8082712	test1: 0.6835577	best: 0.6843603 (87)	total: 8m 11s	remaining: 18m 47s
100:	learn: 0.6647179	test: 0.8113702	test1: 0.6848954	best: 0.6848954 (100)	total: 9m	remaining: 17m 45s
110:	learn: 0.6654384	test: 0.8114649	test1: 0.6847895	best: 0.6848954 (100)	total: 9m 48s	remaining: 16m 42s
120:	learn: 0.6661582	test: 0.8163833	test1: 0.6858433	best: 0.6861756 (118)	total: 10m 38s	remaining: 15m 44s
130:	learn: 0.6669289	test: 0.8167238	test1: 0.6859176	best: 0.6861756 (118)	total: 11m 27s	remaining: 14m 46s
140:	learn: 0.6676552	test: 0.8188321	test1: 0.6868088	best: 0.6869757 (133)	total: 12m 18s	remaining: 13m 52s
150:	learn: 0.6686924	test: 0.8192335	test1: 0.6863474	best: 0.6869974 (143)	total: 13m 16s	remaining: 13m 6s
160:	learn: 0.6691750	test: 0.8197694	test1: 0.6866646	best: 0.6869974 (143)	total: 14m 16s	remaining: 12m 19s
170:	learn: 0.6693290	test: 0.8210153	test1: 0.6867297	best: 0.6869974 (143)	total: 15m 8s	remaining: 11m 25s
180:	learn: 0.6700074	test: 0.8219643	test1: 0.6877937	best: 0.6877937 (180)	total: 16m 2s	remaining: 10m 32s
190:	learn: 0.6703896	test: 0.8225243	test1: 0.6880093	best: 0.6880326 (189)	total: 16m 55s	remaining: 9m 39s
200:	learn: 0.6709395	test: 0.8227387	test1: 0.6879566	best: 0.6881722 (196)	total: 17m 50s	remaining: 8m 47s
210:	learn: 0.6715486	test: 0.8229027	test1: 0.6878556	best: 0.6883883 (203)	total: 18m 45s	remaining: 7m 54s
220:	learn: 0.6718343	test: 0.8229727	test1: 0.6880716	best: 0.6883883 (203)	total: 19m 38s	remaining: 7m 1s
230:	learn: 0.6723365	test: 0.8230236	test1: 0.6881206	best: 0.6884229 (225)	total: 20m 33s	remaining: 6m 8s
240:	learn: 0.6728943	test: 0.8245618	test1: 0.6886997	best: 0.6886997 (240)	total: 21m 26s	remaining: 5m 15s
250:	learn: 0.6731640	test: 0.8247809	test1: 0.6883544	best: 0.6887221 (242)	total: 22m 18s	remaining: 4m 21s
260:	learn: 0.6735842	test: 0.8246325	test1: 0.6883028	best: 0.6887221 (242)	total: 23m 10s	remaining: 3m 27s
270:	learn: 0.6738376	test: 0.8250170	test1: 0.6879131	best: 0.6887221 (242)	total: 23m 59s	remaining: 2m 34s
280:	learn: 0.6743161	test: 0.8260126	test1: 0.6882345	best: 0.6887221 (242)	total: 24m 46s	remaining: 1m 40s
290:	learn: 0.6746999	test: 0.8260436	test1: 0.6880987	best: 0.6887221 (242)	total: 25m 38s	remaining: 47.6s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.6887220573
bestIteration = 242

Shrink model to first 243 iterations.
================================================================================


==================================2==========================================
Learning rate set to 0.235227
0:	learn: 0.5833973	test: 0.5833520	test1: 0.5849483	best: 0.5849483 (0)	total: 6.93s	remaining: 34m 31s
10:	learn: 0.6405256	test: 0.7135342	test1: 0.6547645	best: 0.6547645 (10)	total: 1m 1s	remaining: 26m 44s
20:	learn: 0.6475149	test: 0.7695936	test1: 0.6689917	best: 0.6689917 (20)	total: 1m 50s	remaining: 24m 24s
30:	learn: 0.6529198	test: 0.7766768	test1: 0.6721959	best: 0.6723706 (29)	total: 2m 36s	remaining: 22m 33s
40:	learn: 0.6540265	test: 0.7822622	test1: 0.6742004	best: 0.6742004 (40)	total: 3m 23s	remaining: 21m 24s
50:	learn: 0.6570791	test: 0.7866966	test1: 0.6752796	best: 0.6752796 (50)	total: 4m 12s	remaining: 20m 30s
60:	learn: 0.6592152	test: 0.8024322	test1: 0.6802499	best: 0.6802499 (60)	total: 5m 2s	remaining: 19m 43s
70:	learn: 0.6613426	test: 0.8041085	test1: 0.6823285	best: 0.6824008 (68)	total: 5m 50s	remaining: 18m 49s
80:	learn: 0.6623772	test: 0.8087681	test1: 0.6833217	best: 0.6838120 (74)	total: 6m 38s	remaining: 17m 57s
90:	learn: 0.6636641	test: 0.8099415	test1: 0.6840324	best: 0.6841880 (89)	total: 7m 29s	remaining: 17m 12s
100:	learn: 0.6641158	test: 0.8124503	test1: 0.6844058	best: 0.6844058 (100)	total: 8m 24s	remaining: 16m 33s
110:	learn: 0.6651186	test: 0.8155927	test1: 0.6845911	best: 0.6845911 (110)	total: 9m 15s	remaining: 15m 45s
120:	learn: 0.6655697	test: 0.8171436	test1: 0.6851192	best: 0.6853091 (117)	total: 10m 4s	remaining: 14m 54s
130:	learn: 0.6661185	test: 0.8193981	test1: 0.6858256	best: 0.6859099 (129)	total: 10m 56s	remaining: 14m 7s
140:	learn: 0.6668357	test: 0.8227551	test1: 0.6870682	best: 0.6870682 (140)	total: 11m 51s	remaining: 13m 22s
150:	learn: 0.6673381	test: 0.8236013	test1: 0.6872794	best: 0.6874311 (149)	total: 12m 43s	remaining: 12m 33s
160:	learn: 0.6681053	test: 0.8252143	test1: 0.6872611	best: 0.6874533 (155)	total: 13m 33s	remaining: 11m 42s
170:	learn: 0.6686303	test: 0.8259186	test1: 0.6879214	best: 0.6879827 (168)	total: 14m 22s	remaining: 10m 50s
180:	learn: 0.6688569	test: 0.8261430	test1: 0.6875905	best: 0.6879827 (168)	total: 15m 11s	remaining: 9m 59s
190:	learn: 0.6694051	test: 0.8259791	test1: 0.6876940	best: 0.6879827 (168)	total: 16m 2s	remaining: 9m 9s
200:	learn: 0.6699648	test: 0.8272785	test1: 0.6881933	best: 0.6882394 (196)	total: 16m 56s	remaining: 8m 20s
210:	learn: 0.6703629	test: 0.8281450	test1: 0.6884431	best: 0.6885209 (205)	total: 17m 48s	remaining: 7m 30s
220:	learn: 0.6709914	test: 0.8290551	test1: 0.6889117	best: 0.6891334 (219)	total: 18m 39s	remaining: 6m 40s
230:	learn: 0.6715163	test: 0.8308608	test1: 0.6893840	best: 0.6895373 (227)	total: 19m 31s	remaining: 5m 49s
240:	learn: 0.6719799	test: 0.8318220	test1: 0.6893311	best: 0.6895843 (236)	total: 20m 19s	remaining: 4m 58s
250:	learn: 0.6723687	test: 0.8325428	test1: 0.6893355	best: 0.6895843 (236)	total: 21m 10s	remaining: 4m 7s
260:	learn: 0.6727277	test: 0.8324867	test1: 0.6893129	best: 0.6896539 (255)	total: 22m 4s	remaining: 3m 17s
270:	learn: 0.6730269	test: 0.8325291	test1: 0.6896326	best: 0.6896539 (255)	total: 22m 54s	remaining: 2m 27s
280:	learn: 0.6732117	test: 0.8333668	test1: 0.6900134	best: 0.6900134 (280)	total: 23m 47s	remaining: 1m 36s
290:	learn: 0.6738847	test: 0.8335661	test1: 0.6902571	best: 0.6902571 (290)	total: 24m 38s	remaining: 45.7s
299:	learn: 0.6742525	test: 0.8340623	test1: 0.6898802	best: 0.6902571 (290)	total: 25m 29s	remaining: 0us

bestTest = 0.6902570524
bestIteration = 290

Shrink model to first 291 iterations.
================================================================================


==================================3==========================================
Learning rate set to 0.235227
0:	learn: 0.6216538	test: 0.6229008	test1: 0.6232515	best: 0.6232515 (0)	total: 4.91s	remaining: 24m 28s
10:	learn: 0.6409321	test: 0.7196293	test1: 0.6572837	best: 0.6572837 (10)	total: 58s	remaining: 25m 24s
20:	learn: 0.6477092	test: 0.7508666	test1: 0.6703383	best: 0.6706611 (19)	total: 1m 46s	remaining: 23m 40s
30:	learn: 0.6530137	test: 0.7801107	test1: 0.6792608	best: 0.6792608 (30)	total: 2m 38s	remaining: 22m 53s
40:	learn: 0.6563886	test: 0.7880980	test1: 0.6822498	best: 0.6822498 (40)	total: 3m 26s	remaining: 21m 44s
50:	learn: 0.6584105	test: 0.7898914	test1: 0.6822932	best: 0.6826247 (45)	total: 4m 14s	remaining: 20m 40s
60:	learn: 0.6600730	test: 0.7955104	test1: 0.6835651	best: 0.6838458 (58)	total: 5m 2s	remaining: 19m 43s
70:	learn: 0.6618595	test: 0.7995003	test1: 0.6847355	best: 0.6850094 (68)	total: 5m 49s	remaining: 18m 47s
80:	learn: 0.6629089	test: 0.8038064	test1: 0.6862297	best: 0.6862297 (80)	total: 6m 38s	remaining: 17m 58s
90:	learn: 0.6642163	test: 0.8072907	test1: 0.6873777	best: 0.6873819 (86)	total: 7m 30s	remaining: 17m 14s
100:	learn: 0.6652178	test: 0.8099029	test1: 0.6871833	best: 0.6878281 (93)	total: 8m 21s	remaining: 16m 28s
110:	learn: 0.6660726	test: 0.8102764	test1: 0.6874057	best: 0.6878281 (93)	total: 9m 9s	remaining: 15m 36s
120:	learn: 0.6666968	test: 0.8105069	test1: 0.6875793	best: 0.6878281 (93)	total: 9m 57s	remaining: 14m 43s
130:	learn: 0.6674710	test: 0.8135577	test1: 0.6885519	best: 0.6885519 (130)	total: 10m 49s	remaining: 13m 57s
140:	learn: 0.6684035	test: 0.8146636	test1: 0.6890182	best: 0.6890182 (140)	total: 11m 39s	remaining: 13m 8s
150:	learn: 0.6686630	test: 0.8171446	test1: 0.6896200	best: 0.6899891 (146)	total: 12m 31s	remaining: 12m 21s
160:	learn: 0.6690546	test: 0.8183318	test1: 0.6899944	best: 0.6901300 (156)	total: 13m 34s	remaining: 11m 42s
170:	learn: 0.6695222	test: 0.8210363	test1: 0.6905764	best: 0.6906455 (169)	total: 14m 44s	remaining: 11m 7s
180:	learn: 0.6703622	test: 0.8230552	test1: 0.6913347	best: 0.6913347 (180)	total: 15m 38s	remaining: 10m 17s
190:	learn: 0.6705191	test: 0.8236182	test1: 0.6912294	best: 0.6913541 (182)	total: 16m 31s	remaining: 9m 25s
200:	learn: 0.6710788	test: 0.8245452	test1: 0.6905407	best: 0.6913541 (182)	total: 17m 20s	remaining: 8m 32s
210:	learn: 0.6716998	test: 0.8254383	test1: 0.6899529	best: 0.6913541 (182)	total: 18m 9s	remaining: 7m 39s
220:	learn: 0.6721097	test: 0.8255137	test1: 0.6896744	best: 0.6913541 (182)	total: 19m 3s	remaining: 6m 48s
230:	learn: 0.6723847	test: 0.8258951	test1: 0.6900970	best: 0.6913541 (182)	total: 19m 51s	remaining: 5m 55s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.6913541358
bestIteration = 182

Shrink model to first 183 iterations.
================================================================================


==================================4==========================================
Learning rate set to 0.235227
0:	learn: 0.6088897	test: 0.6138620	test1: 0.6135232	best: 0.6135232 (0)	total: 9.63s	remaining: 47m 57s
10:	learn: 0.6413823	test: 0.7112563	test1: 0.6592475	best: 0.6593885 (9)	total: 1m 6s	remaining: 29m 7s
20:	learn: 0.6477426	test: 0.7446712	test1: 0.6684289	best: 0.6693861 (18)	total: 1m 57s	remaining: 26m 6s
30:	learn: 0.6517837	test: 0.7668160	test1: 0.6742435	best: 0.6742435 (30)	total: 2m 48s	remaining: 24m 20s
40:	learn: 0.6543537	test: 0.7806687	test1: 0.6772691	best: 0.6772691 (40)	total: 3m 40s	remaining: 23m 10s
50:	learn: 0.6572068	test: 0.7856670	test1: 0.6789735	best: 0.6791219 (47)	total: 4m 30s	remaining: 22m 2s
60:	learn: 0.6595529	test: 0.8010034	test1: 0.6835494	best: 0.6838487 (58)	total: 5m 22s	remaining: 21m 2s
70:	learn: 0.6609678	test: 0.8062261	test1: 0.6858258	best: 0.6858258 (70)	total: 6m 14s	remaining: 20m 7s
80:	learn: 0.6623339	test: 0.8121792	test1: 0.6859007	best: 0.6866375 (75)	total: 7m 5s	remaining: 19m 9s
90:	learn: 0.6636772	test: 0.8153475	test1: 0.6863057	best: 0.6866375 (75)	total: 7m 55s	remaining: 18m 11s
100:	learn: 0.6646229	test: 0.8169151	test1: 0.6870916	best: 0.6873799 (98)	total: 8m 48s	remaining: 17m 21s
110:	learn: 0.6659334	test: 0.8188139	test1: 0.6867931	best: 0.6873799 (98)	total: 9m 55s	remaining: 16m 54s
120:	learn: 0.6667287	test: 0.8193332	test1: 0.6870785	best: 0.6873799 (98)	total: 10m 46s	remaining: 15m 56s
130:	learn: 0.6676020	test: 0.8197830	test1: 0.6869205	best: 0.6873799 (98)	total: 11m 39s	remaining: 15m 2s
140:	learn: 0.6679969	test: 0.8199328	test1: 0.6869941	best: 0.6873799 (98)	total: 12m 29s	remaining: 14m 4s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.6873798906
bestIteration = 98

Shrink model to first 99 iterations.
================================================================================


==================================5==========================================
Learning rate set to 0.235227
0:	learn: 0.6485131	test: 0.6517068	test1: 0.6479244	best: 0.6479244 (0)	total: 5.08s	remaining: 25m 17s
10:	learn: 0.6395082	test: 0.7089322	test1: 0.6549795	best: 0.6549795 (10)	total: 1m 2s	remaining: 27m 9s
20:	learn: 0.6457125	test: 0.7400387	test1: 0.6659492	best: 0.6659492 (20)	total: 1m 53s	remaining: 25m 11s
30:	learn: 0.6514184	test: 0.7604515	test1: 0.6730497	best: 0.6730497 (30)	total: 2m 44s	remaining: 23m 43s
40:	learn: 0.6546387	test: 0.7762775	test1: 0.6760438	best: 0.6763032 (37)	total: 3m 29s	remaining: 22m 6s
50:	learn: 0.6567014	test: 0.7823814	test1: 0.6762968	best: 0.6766550 (46)	total: 4m 19s	remaining: 21m 5s
60:	learn: 0.6586215	test: 0.7945281	test1: 0.6778286	best: 0.6778465 (58)	total: 5m 10s	remaining: 20m 16s
70:	learn: 0.6601434	test: 0.7992026	test1: 0.6820142	best: 0.6820142 (70)	total: 6m 2s	remaining: 19m 28s
80:	learn: 0.6620918	test: 0.8051808	test1: 0.6835441	best: 0.6835441 (80)	total: 6m 51s	remaining: 18m 33s
90:	learn: 0.6632307	test: 0.8095564	test1: 0.6847451	best: 0.6852213 (89)	total: 7m 43s	remaining: 17m 43s
100:	learn: 0.6646119	test: 0.8102420	test1: 0.6855051	best: 0.6855051 (100)	total: 8m 35s	remaining: 16m 56s
110:	learn: 0.6661142	test: 0.8136297	test1: 0.6855520	best: 0.6858795 (107)	total: 9m 34s	remaining: 16m 17s
120:	learn: 0.6664666	test: 0.8142156	test1: 0.6856690	best: 0.6858795 (107)	total: 10m 27s	remaining: 15m 28s
130:	learn: 0.6671449	test: 0.8158779	test1: 0.6858933	best: 0.6858933 (130)	total: 11m 19s	remaining: 14m 37s
140:	learn: 0.6679393	test: 0.8168439	test1: 0.6859532	best: 0.6860749 (134)	total: 12m 12s	remaining: 13m 45s
150:	learn: 0.6682700	test: 0.8174497	test1: 0.6852882	best: 0.6861378 (145)	total: 13m 3s	remaining: 12m 52s
160:	learn: 0.6687193	test: 0.8181760	test1: 0.6852596	best: 0.6861378 (145)	total: 13m 53s	remaining: 11m 59s
170:	learn: 0.6690934	test: 0.8181678	test1: 0.6847725	best: 0.6861378 (145)	total: 14m 46s	remaining: 11m 8s
180:	learn: 0.6698601	test: 0.8202472	test1: 0.6854161	best: 0.6861378 (145)	total: 15m 38s	remaining: 10m 17s
190:	learn: 0.6703541	test: 0.8222802	test1: 0.6853611	best: 0.6861378 (145)	total: 16m 33s	remaining: 9m 26s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.6861377913
bestIteration = 145

Shrink model to first 146 iterations.
================================================================================


Wall time: 4h 33min 14s
# Average the per-fold class probabilities.  The mean is taken over whatever
# is in `result`, so neither the number of test rows nor the number of folds
# is hard-coded here.
pred_prob = np.mean(result, axis=0)

# Label a row positive when the averaged positive-class probability exceeds
# the 0.4 cut-off chosen earlier.
pred = (pred_prob[:, 1] > 0.4).astype('int')

submission = pd.read_csv('sample_submission.csv')
submission['target'] = pred

submission.to_csv('Kfold_Catboost.csv', index=False)

카테고리:

업데이트: