[Dacon] Jobcare Recommendation Algorithm Competition
I took part in the recommendation algorithm competition hosted on Dacon.
Since the data contained many categorical features, I approached it with CatBoost.
There is still plenty of room for improvement, and I plan to keep studying!
Importing libraries
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook
from sklearn.model_selection import KFold,GridSearchCV,train_test_split,StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
from catboost import CatBoostClassifier
%matplotlib inline
import warnings
warnings.filterwarnings(action='ignore')
# Helper for computing evaluation scores
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
def get_clf_eval(y_test, pred):
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    print('Accuracy: {}, Precision: {}, Recall: {}'.format(accuracy, precision, recall))
    print(f"F1 score : {f1_score(y_test, pred)}")
train = pd.read_csv("train.csv", encoding='euc-kr')
test = pd.read_csv("test.csv", encoding='euc-kr')
Preprocessing
- For attributes D, H, and L, add the category-level codes (large / medium / small / detailed); for D and H, create columns flagging whether each content attribute matches the member's preferred attribute at every level
- Convert bool columns to 0/1
# Add the D, H, L attribute code tables.
# Each table is transposed into a dict keyed by attribute code,
# e.g. d_code[code]['속성 D 대분류코드'] returns the large-category code.
d_code = pd.read_csv('속성_D_코드.csv', index_col=0, encoding='euc-kr').T.to_dict()
h_code = pd.read_csv('속성_H_코드.csv', index_col=0).T.to_dict()
l_code = pd.read_csv('속성_L_코드.csv', index_col=0).T.to_dict()
def add_code(df, d_code, h_code, l_code):
    df = df.copy()
    # D code
    df['person_prefer_d_1_n'] = df['person_prefer_d_1'].apply(lambda x: d_code[x]['속성 D 세분류코드'])
    df['person_prefer_d_1_s'] = df['person_prefer_d_1'].apply(lambda x: d_code[x]['속성 D 소분류코드'])
    df['person_prefer_d_1_m'] = df['person_prefer_d_1'].apply(lambda x: d_code[x]['속성 D 중분류코드'])
    df['person_prefer_d_1_l'] = df['person_prefer_d_1'].apply(lambda x: d_code[x]['속성 D 대분류코드'])
    df['person_prefer_d_2_n'] = df['person_prefer_d_2'].apply(lambda x: d_code[x]['속성 D 세분류코드'])
    df['person_prefer_d_2_s'] = df['person_prefer_d_2'].apply(lambda x: d_code[x]['속성 D 소분류코드'])
    df['person_prefer_d_2_m'] = df['person_prefer_d_2'].apply(lambda x: d_code[x]['속성 D 중분류코드'])
    df['person_prefer_d_2_l'] = df['person_prefer_d_2'].apply(lambda x: d_code[x]['속성 D 대분류코드'])
    df['person_prefer_d_3_n'] = df['person_prefer_d_3'].apply(lambda x: d_code[x]['속성 D 세분류코드'])
    df['person_prefer_d_3_s'] = df['person_prefer_d_3'].apply(lambda x: d_code[x]['속성 D 소분류코드'])
    df['person_prefer_d_3_m'] = df['person_prefer_d_3'].apply(lambda x: d_code[x]['속성 D 중분류코드'])
    df['person_prefer_d_3_l'] = df['person_prefer_d_3'].apply(lambda x: d_code[x]['속성 D 대분류코드'])
    df['contents_attribute_d_n'] = df['contents_attribute_d'].apply(lambda x: d_code[x]['속성 D 세분류코드'])
    df['contents_attribute_d_s'] = df['contents_attribute_d'].apply(lambda x: d_code[x]['속성 D 소분류코드'])
    df['contents_attribute_d_m'] = df['contents_attribute_d'].apply(lambda x: d_code[x]['속성 D 중분류코드'])
    df['contents_attribute_d_l'] = df['contents_attribute_d'].apply(lambda x: d_code[x]['속성 D 대분류코드'])
    # H code
    df['person_prefer_h_1_m'] = df['person_prefer_h_1'].apply(lambda x: h_code[x]['속성 H 중분류코드'])
    df['person_prefer_h_1_l'] = df['person_prefer_h_1'].apply(lambda x: h_code[x]['속성 H 대분류코드'])
    df['person_prefer_h_2_m'] = df['person_prefer_h_2'].apply(lambda x: h_code[x]['속성 H 중분류코드'])
    df['person_prefer_h_2_l'] = df['person_prefer_h_2'].apply(lambda x: h_code[x]['속성 H 대분류코드'])
    df['person_prefer_h_3_m'] = df['person_prefer_h_3'].apply(lambda x: h_code[x]['속성 H 중분류코드'])
    df['person_prefer_h_3_l'] = df['person_prefer_h_3'].apply(lambda x: h_code[x]['속성 H 대분류코드'])
    df['contents_attribute_h_m'] = df['contents_attribute_h'].apply(lambda x: h_code[x]['속성 H 중분류코드'])
    df['contents_attribute_h_l'] = df['contents_attribute_h'].apply(lambda x: h_code[x]['속성 H 대분류코드'])
    # L code
    df['contents_attribute_l_n'] = df['contents_attribute_l'].apply(lambda x: l_code[x]['속성 L 세분류코드'])
    df['contents_attribute_l_s'] = df['contents_attribute_l'].apply(lambda x: l_code[x]['속성 L 소분류코드'])
    df['contents_attribute_l_m'] = df['contents_attribute_l'].apply(lambda x: l_code[x]['속성 L 중분류코드'])
    df['contents_attribute_l_l'] = df['contents_attribute_l'].apply(lambda x: l_code[x]['속성 L 대분류코드'])
    return df
train = add_code(train, d_code, h_code, l_code)
test = add_code(test, d_code, h_code, l_code)
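The add_code function above spells out every column for clarity. A more compact equivalent could loop over the level suffixes instead; below is a sketch that assumes the same column names and code tables as above (the helper name add_code_compact is hypothetical).

import pandas as pd

# Hypothetical compact version of add_code: the nested loops generate the same
# *_n / *_s / *_m / *_l level columns as the explicit version above.
def add_code_compact(df, d_code, h_code, l_code):
    df = df.copy()
    specs = [
        (d_code, ['person_prefer_d_1', 'person_prefer_d_2', 'person_prefer_d_3', 'contents_attribute_d'],
         {'n': '속성 D 세분류코드', 's': '속성 D 소분류코드', 'm': '속성 D 중분류코드', 'l': '속성 D 대분류코드'}),
        (h_code, ['person_prefer_h_1', 'person_prefer_h_2', 'person_prefer_h_3', 'contents_attribute_h'],
         {'m': '속성 H 중분류코드', 'l': '속성 H 대분류코드'}),
        (l_code, ['contents_attribute_l'],
         {'n': '속성 L 세분류코드', 's': '속성 L 소분류코드', 'm': '속성 L 중분류코드', 'l': '속성 L 대분류코드'}),
    ]
    for code_map, base_cols, levels in specs:
        for base in base_cols:
            for suffix, code_col in levels.items():
                df[f'{base}_{suffix}'] = df[base].apply(lambda x, c=code_col: code_map[x][c])
    return df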
Creating columns that flag whether preference and content attributes match (D, H)
temp = pd.DataFrame()
# Match-indicator columns for attribute D
for i in [1, 2, 3]:
    psn = np.array(train[[col for col in train.columns if col.startswith(f'person_prefer_d_{i}')]])
    cnt = np.array(train[[col for col in train.columns if col.startswith('contents_attribute_d')]])
    result = (psn == cnt)
    result = pd.DataFrame(result, columns=[f'd{i}_match_yn', f'd{i}_n_match_yn', f'd{i}_s_match_yn', f'd{i}_m_match_yn', f'd{i}_l_match_yn'])
    temp = pd.concat([temp, result], axis=1)
# Match-indicator columns for attribute H
for i in [1, 2, 3]:
    psn = np.array(train[[col for col in train.columns if col.startswith(f'person_prefer_h_{i}')]])
    cnt = np.array(train[[col for col in train.columns if col.startswith('contents_attribute_h')]])
    result = (psn == cnt)
    result = pd.DataFrame(result, columns=[f'h{i}_match_yn', f'h{i}_m_match_yn', f'h{i}_l_match_yn'])
    temp = pd.concat([temp, result], axis=1)
train_1 = pd.concat([train, temp], axis=1)
temp = pd.DataFrame()
# Match-indicator columns for attribute D
for i in [1, 2, 3]:
    psn = np.array(test[[col for col in test.columns if col.startswith(f'person_prefer_d_{i}')]])
    cnt = np.array(test[[col for col in test.columns if col.startswith('contents_attribute_d')]])
    result = (psn == cnt)
    result = pd.DataFrame(result, columns=[f'd{i}_match_yn', f'd{i}_n_match_yn', f'd{i}_s_match_yn', f'd{i}_m_match_yn', f'd{i}_l_match_yn'])
    temp = pd.concat([temp, result], axis=1)
# Match-indicator columns for attribute H
for i in [1, 2, 3]:
    psn = np.array(test[[col for col in test.columns if col.startswith(f'person_prefer_h_{i}')]])
    cnt = np.array(test[[col for col in test.columns if col.startswith('contents_attribute_h')]])
    result = (psn == cnt)
    result = pd.DataFrame(result, columns=[f'h{i}_match_yn', f'h{i}_m_match_yn', f'h{i}_l_match_yn'])
    temp = pd.concat([temp, result], axis=1)
test_1 = pd.concat([test, temp], axis=1)
# Add match-indicator columns for attributes C and E
train_1['c_match_yn'] = (train_1['person_prefer_c'] == train_1['contents_attribute_c'])
test_1['c_match_yn'] = (test_1['person_prefer_c'] == test_1['contents_attribute_c'])
train_1['e_match_yn'] = (train_1['person_prefer_e'] == train_1['contents_attribute_e'])
test_1['e_match_yn'] = (test_1['person_prefer_e'] == test_1['contents_attribute_e'])
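Since the exact same block runs once for train and once for test, it could also be wrapped in a small reusable helper. Below is a sketch assuming the level columns created by add_code already exist; the function name add_match_columns is hypothetical.

import pandas as pd

# Hypothetical helper: builds the D/H/C/E match-indicator columns for any
# dataframe that already has the person_prefer_* and contents_attribute_* columns.
def add_match_columns(df):
    pieces = [df]
    for i in [1, 2, 3]:
        psn = df[[c for c in df.columns if c.startswith(f'person_prefer_d_{i}')]].to_numpy()
        cnt = df[[c for c in df.columns if c.startswith('contents_attribute_d')]].to_numpy()
        pieces.append(pd.DataFrame(psn == cnt, index=df.index,
                                   columns=[f'd{i}_match_yn', f'd{i}_n_match_yn', f'd{i}_s_match_yn',
                                            f'd{i}_m_match_yn', f'd{i}_l_match_yn']))
        psn = df[[c for c in df.columns if c.startswith(f'person_prefer_h_{i}')]].to_numpy()
        cnt = df[[c for c in df.columns if c.startswith('contents_attribute_h')]].to_numpy()
        pieces.append(pd.DataFrame(psn == cnt, index=df.index,
                                   columns=[f'h{i}_match_yn', f'h{i}_m_match_yn', f'h{i}_l_match_yn']))
    out = pd.concat(pieces, axis=1)
    out['c_match_yn'] = (out['person_prefer_c'] == out['contents_attribute_c'])
    out['e_match_yn'] = (out['person_prefer_e'] == out['contents_attribute_e'])
    return out

# train_1 = add_match_columns(train)
# test_1 = add_match_columns(test)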
The dataset now has all the columns we need, so save it to disk.
train_1.to_csv("train_1.csv")
test_1.to_csv("test_1.csv")
# Reload, dropping the index column written by to_csv
train_1 = pd.read_csv('train_1.csv').iloc[:, 1:]
test_1 = pd.read_csv('test_1.csv').iloc[:, 1:]
Checking the loaded data
print(train_1.shape)
print(test_1.shape)
(501951, 89)
(46404, 88)
Choosing which columns to use
- contents_open_dt, contents_rn, id, and person_rn are excluded (using them hurt performance)
- person_prefer_f and person_prefer_g are excluded because they take the same value for every row
- target is also set aside from the train data for now
cols = list(set(train_1.columns) - set(['contents_open_dt','contents_rn','id','person_rn','person_prefer_f','person_prefer_g','target']))
len(cols)
82
Excluding target, a total of 82 columns are used.
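One small caveat: building cols with a set difference does not preserve column order, so the order can vary between runs. An order-preserving alternative with the same drop list would be (a sketch):

# Keep the original column order while dropping the unused columns
drop_cols = ['contents_open_dt', 'contents_rn', 'id', 'person_rn',
             'person_prefer_f', 'person_prefer_g', 'target']
cols = [c for c in train_1.columns if c not in drop_cols]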
train_temp = pd.concat([train_1[cols],train_1['target']], axis=1)
test_temp = test_1[cols]
# Specify the categorical (nominal) features
categori_col = [
# member attributes
'person_attribute_a',
# member preference attributes
'person_prefer_c',
'person_prefer_d_1',
'person_prefer_d_1_l','person_prefer_d_1_m','person_prefer_d_1_s','person_prefer_d_1_n',
'person_prefer_d_2',
'person_prefer_d_2_l','person_prefer_d_2_m','person_prefer_d_2_s','person_prefer_d_2_n',
'person_prefer_d_3',
'person_prefer_d_3_l','person_prefer_d_3_m','person_prefer_d_3_s','person_prefer_d_3_n',
'person_prefer_h_1',
'person_prefer_h_1_l','person_prefer_h_1_m',
'person_prefer_h_2',
'person_prefer_h_2_l','person_prefer_h_2_m',
'person_prefer_h_3',
'person_prefer_h_3_l','person_prefer_h_3_m',
# content attributes
'contents_attribute_i', 'contents_attribute_a', 'contents_attribute_j_1', 'contents_attribute_j', 'contents_attribute_c',
'contents_attribute_h',
'contents_attribute_h_l','contents_attribute_h_m',
'contents_attribute_d',
'contents_attribute_d_l','contents_attribute_d_m','contents_attribute_d_s','contents_attribute_d_n',
'contents_attribute_k','contents_attribute_m',
'contents_attribute_l',
'contents_attribute_l_l','contents_attribute_l_m','contents_attribute_l_s','contents_attribute_l_n',
]
# Convert True / False to 1 / 0
for column in train_temp.select_dtypes(include='bool').columns:
    train_temp[column] = train_temp[column].astype(int)
for column in test_temp.select_dtypes(include='bool').columns:
    test_temp[column] = test_temp[column].astype(int)
# Cast the categorical features to object dtype
train_temp[categori_col] = train_temp[categori_col].astype('object')
test_temp[categori_col] = test_temp[categori_col].astype('object')
train_temp.dtypes.value_counts()
# 46 categorical and 37 numeric columns
# 82 feature columns + target = 83 in total
object 46
int32 32
int64 5
dtype: int64
# Numeric features
train_temp.select_dtypes(exclude='object').apply(pd.Series.nunique, axis=0)
h_l_match_yn 2
h_s_match_yn 2
d2_match_yn 2
d3_m_match_yn 2
h3_l_match_yn 2
h1_m_match_yn 2
c_match_yn 2
d3_match_yn 2
d3_l_match_yn 2
d1_match_yn 2
d_s_match_yn 2
d3_s_match_yn 2
person_prefer_e 12
h1_match_yn 2
d2_l_match_yn 2
contents_attribute_e 12
e_match_yn 2
h2_l_match_yn 2
h2_m_match_yn 2
d_m_match_yn 2
person_attribute_b 6
h3_match_yn 2
d_l_match_yn 2
h1_l_match_yn 2
d1_m_match_yn 2
d2_n_match_yn 2
d1_n_match_yn 2
h2_match_yn 2
d1_s_match_yn 2
person_attribute_a_1 8
h_m_match_yn 2
d3_n_match_yn 2
d1_l_match_yn 2
h3_m_match_yn 2
d2_s_match_yn 2
d2_m_match_yn 2
target 2
dtype: int64
# Categorical features
train_temp.select_dtypes('object').apply(pd.Series.nunique, axis=0)
contents_attribute_h 250
person_prefer_d_3_l 11
person_prefer_h_3 279
contents_attribute_k 2
contents_attribute_c 4
person_prefer_d_1_n 443
person_prefer_c 5
person_prefer_d_2_n 435
person_prefer_h_1_l 19
person_prefer_d_2 1081
person_prefer_d_2_s 137
contents_attribute_a 3
person_prefer_d_3_n 420
contents_attribute_d_s 137
person_prefer_h_2 279
contents_attribute_d_m 36
person_prefer_h_1_m 246
person_prefer_d_1_m 36
contents_attribute_j 2
contents_attribute_l_n 736
contents_attribute_m 5
person_prefer_d_2_m 36
person_prefer_d_1_l 11
person_prefer_h_1 279
contents_attribute_j_1 9
person_prefer_d_3_m 36
person_attribute_a 2
contents_attribute_l_m 79
person_prefer_d_1_s 137
person_prefer_h_2_l 19
contents_attribute_i 3
contents_attribute_l_s 305
person_prefer_h_3_m 246
contents_attribute_h_l 17
person_prefer_h_3_l 19
contents_attribute_d_l 11
person_prefer_d_1 1093
contents_attribute_d 1065
person_prefer_d_2_l 11
contents_attribute_h_m 228
person_prefer_d_3_s 136
contents_attribute_l_l 21
contents_attribute_l 1752
contents_attribute_d_n 431
person_prefer_d_3 1043
person_prefer_h_2_m 246
dtype: int64
train_temp.shape
(501951, 83)
test_temp.shape
(46404, 82)
Checking performance on a hold-out split of the train data
# Split into train / test
X = train_temp[train_temp.columns.difference(['target'])]
y = train_temp[['target']]
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.2, stratify=y)
# Split train again into train / validation
train_x, valid_x, train_y, valid_y = train_test_split(train_x, train_y, test_size=0.125, stratify=train_y)
print(train_x.shape)
print(train_y.shape)
print(test_x.shape)
print(test_y.shape)
print(valid_x.shape)
print(valid_y.shape)
(351365, 82)
(351365, 1)
(100391, 82)
(100391, 1)
(50195, 82)
(50195, 1)
My laptop is not very powerful, so k-fold CV will only be used for the final submission. For now, let's fit a single model.
# Fit a CatBoostClassifier
seed = 116
cat = CatBoostClassifier(n_estimators=300,
                         random_seed=seed,
                         bootstrap_type='Bernoulli',
                         eval_metric="F1",
                         one_hot_max_size=4)
cat.fit(train_x, train_y,
        eval_set=[(train_x, train_y), (valid_x, valid_y)],
        early_stopping_rounds=50,
        cat_features=categori_col,
        verbose=10)
Learning rate set to 0.227595
0: learn: 0.6003634 test: 0.6028502 test1: 0.5975898 best: 0.5975898 (0) total: 6.52s remaining: 32m 28s
10: learn: 0.6408569 test: 0.7035863 test1: 0.6519279 best: 0.6519279 (10) total: 56.4s remaining: 24m 42s
20: learn: 0.6467647 test: 0.7461439 test1: 0.6643557 best: 0.6643557 (20) total: 1m 42s remaining: 22m 35s
30: learn: 0.6505808 test: 0.7705283 test1: 0.6698844 best: 0.6698844 (30) total: 2m 28s remaining: 21m 26s
40: learn: 0.6523066 test: 0.7913883 test1: 0.6748939 best: 0.6748939 (40) total: 3m 12s remaining: 20m 14s
50: learn: 0.6543541 test: 0.7934416 test1: 0.6746880 best: 0.6758294 (45) total: 4m 9s remaining: 20m 15s
60: learn: 0.6559083 test: 0.7955448 test1: 0.6771044 best: 0.6774164 (59) total: 5m 1s remaining: 19m 40s
70: learn: 0.6574470 test: 0.8012121 test1: 0.6791637 best: 0.6797244 (69) total: 5m 42s remaining: 18m 25s
80: learn: 0.6593060 test: 0.8068754 test1: 0.6795986 best: 0.6799739 (76) total: 6m 23s remaining: 17m 16s
90: learn: 0.6609507 test: 0.8116928 test1: 0.6816082 best: 0.6819080 (87) total: 7m 7s remaining: 16m 21s
100: learn: 0.6621940 test: 0.8127825 test1: 0.6813540 best: 0.6819080 (87) total: 8m 9s remaining: 16m 3s
110: learn: 0.6628987 test: 0.8167067 test1: 0.6827510 best: 0.6827510 (110) total: 8m 51s remaining: 15m 4s
120: learn: 0.6640435 test: 0.8176961 test1: 0.6833405 best: 0.6835197 (119) total: 9m 35s remaining: 14m 11s
130: learn: 0.6645559 test: 0.8183881 test1: 0.6837454 best: 0.6837652 (124) total: 10m 20s remaining: 13m 20s
140: learn: 0.6653198 test: 0.8196152 test1: 0.6839979 best: 0.6843953 (138) total: 11m 6s remaining: 12m 31s
150: learn: 0.6659125 test: 0.8203926 test1: 0.6837485 best: 0.6843953 (138) total: 11m 52s remaining: 11m 42s
160: learn: 0.6662940 test: 0.8205866 test1: 0.6831761 best: 0.6843953 (138) total: 12m 49s remaining: 11m 4s
170: learn: 0.6668303 test: 0.8204332 test1: 0.6838282 best: 0.6843953 (138) total: 13m 40s remaining: 10m 19s
180: learn: 0.6678044 test: 0.8214092 test1: 0.6836710 best: 0.6843953 (138) total: 14m 26s remaining: 9m 29s
Stopped by overfitting detector (50 iterations wait)
bestTest = 0.6843953089
bestIteration = 138
Shrink model to first 139 iterations.
<catboost.core.CatBoostClassifier at 0x1e5cf32f310>
Adjusting the decision threshold improves the F1 score, so I search a grid of threshold values for the best one.
pred_prob = cat.predict_proba(test_x)
threashold = np.arange(0.3, 0.61, 0.01)
for i in tqdm_notebook(threashold):
    temp = (pd.DataFrame(pred_prob) > i)
    pred = temp.iloc[:, 1].astype('int')
    print(f"threashold : {round(i,3)} , F1_score : {f1_score(test_y,pred)}")
0%| | 0/31 [00:00<?, ?it/s]
threashold : 0.3 , F1_score : 0.7030653590933115
threashold : 0.31 , F1_score : 0.7043846614473597
threashold : 0.32 , F1_score : 0.7055239854322295
threashold : 0.33 , F1_score : 0.706482976087691
threashold : 0.34 , F1_score : 0.7072480009692271
threashold : 0.35 , F1_score : 0.7085153839102937
threashold : 0.36 , F1_score : 0.7092232485697102
threashold : 0.37 , F1_score : 0.7095925389633643
threashold : 0.38 , F1_score : 0.7100758362440605
threashold : 0.39 , F1_score : 0.7098318672753591
threashold : 0.4 , F1_score : 0.7098077077801915
threashold : 0.41 , F1_score : 0.7094778318706548
threashold : 0.42 , F1_score : 0.7083024156979141
threashold : 0.43 , F1_score : 0.7071526994864087
threashold : 0.44 , F1_score : 0.7054687764713855
threashold : 0.45 , F1_score : 0.7031824586727154
threashold : 0.46 , F1_score : 0.6999057788944724
threashold : 0.47 , F1_score : 0.6966325996968541
threashold : 0.48 , F1_score : 0.6925287615203741
threashold : 0.49 , F1_score : 0.6879715446036505
threashold : 0.5 , F1_score : 0.683328196650883
threashold : 0.51 , F1_score : 0.6770299196118537
threashold : 0.52 , F1_score : 0.6697723170092588
threashold : 0.53 , F1_score : 0.6623810136989015
threashold : 0.54 , F1_score : 0.6535716093757911
threashold : 0.55 , F1_score : 0.6432878188260158
threashold : 0.56 , F1_score : 0.6331234230222845
threashold : 0.57 , F1_score : 0.6213353898960494
threashold : 0.58 , F1_score : 0.6076155251548505
threashold : 0.59 , F1_score : 0.5922310119347021
threashold : 0.6 , F1_score : 0.5753981303142763
The F1 score peaks around a threshold of 0.38-0.40, so I settled on the round value of 0.4.
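Instead of reading the best value off the printout, the threshold could also be chosen programmatically. A minimal sketch, reusing pred_prob and test_y from the cells above:

import numpy as np
from sklearn.metrics import f1_score

# Score every candidate threshold on the positive-class probability and take the argmax
candidates = np.arange(0.3, 0.61, 0.01)
scores = [f1_score(test_y, (pred_prob[:, 1] > t).astype(int)) for t in candidates]
best_t = candidates[int(np.argmax(scores))]
print(f"best threshold: {best_t:.2f}, F1: {max(scores):.4f}")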
pred_prob = cat.predict_proba(test_x)
temp = (pd.DataFrame(pred_prob) > 0.4)
pred = temp.iloc[:,1].astype('int')
get_clf_eval(test_y['target'], pred)
Accuracy: 0.6407247661642976, Precision: 0.5951214905357456, Recall: 0.8792481412824653
F1 score : 0.7098077077801915
The model reaches an F1 score of about 0.71.
Creating the submission from the test data
- Run stratified k-fold CV
train_y = train_temp['target']
train_x = train_temp.drop(['target'], axis=1)
%%time
seed = 2022
k = 5
models = []
result = []
folds = StratifiedKFold(n_splits = k, shuffle = True, random_state = seed)
for n_fold, (train_idx, valid_idx) in tqdm_notebook(enumerate(folds.split(train_x, train_y))):
    print(f'=================================={n_fold+1}==========================================')
    # Split into train / validation for this fold
    train_x_fold, valid_x_fold = train_x.iloc[train_idx], train_x.iloc[valid_idx]
    train_y_fold, valid_y_fold = train_y.iloc[train_idx], train_y.iloc[valid_idx]
    # Build the model
    cat = CatBoostClassifier(n_estimators=300, random_seed=seed, bootstrap_type='Bernoulli', eval_metric="F1", custom_loss='F1')
    # Fit the model
    cat.fit(train_x_fold, train_y_fold,
            eval_set=[(train_x_fold, train_y_fold), (valid_x_fold, valid_y_fold)],
            early_stopping_rounds=50,
            cat_features=categori_col,
            verbose=10)
    # Predict on the test data (as probabilities)
    test_pred_prob = cat.predict_proba(test_temp)
    models.append(cat)
    result.append(test_pred_prob)
    print(f'================================================================================\n\n')
0it [00:00, ?it/s]
==================================1==========================================
Learning rate set to 0.235227
0: learn: 0.5962659 test: 0.5967534 test1: 0.5945962 best: 0.5945962 (0) total: 13s remaining: 1h 4m 39s
10: learn: 0.6420607 test: 0.7252085 test1: 0.6597472 best: 0.6597472 (10) total: 1m 17s remaining: 33m 47s
20: learn: 0.6510418 test: 0.7609060 test1: 0.6713420 best: 0.6713420 (20) total: 2m 15s remaining: 30m 6s
30: learn: 0.6537382 test: 0.7695586 test1: 0.6757894 best: 0.6757894 (30) total: 3m 8s remaining: 27m 19s
40: learn: 0.6559261 test: 0.7805929 test1: 0.6775557 best: 0.6776516 (39) total: 4m 4s remaining: 25m 42s
50: learn: 0.6578211 test: 0.7813806 test1: 0.6783032 best: 0.6786181 (48) total: 4m 49s remaining: 23m 35s
60: learn: 0.6593789 test: 0.7928956 test1: 0.6805314 best: 0.6806777 (58) total: 5m 37s remaining: 22m 3s
70: learn: 0.6608937 test: 0.7945964 test1: 0.6804401 best: 0.6810295 (62) total: 6m 27s remaining: 20m 48s
80: learn: 0.6622386 test: 0.8050873 test1: 0.6835431 best: 0.6835431 (80) total: 7m 21s remaining: 19m 52s
90: learn: 0.6635762 test: 0.8082712 test1: 0.6835577 best: 0.6843603 (87) total: 8m 11s remaining: 18m 47s
100: learn: 0.6647179 test: 0.8113702 test1: 0.6848954 best: 0.6848954 (100) total: 9m remaining: 17m 45s
110: learn: 0.6654384 test: 0.8114649 test1: 0.6847895 best: 0.6848954 (100) total: 9m 48s remaining: 16m 42s
120: learn: 0.6661582 test: 0.8163833 test1: 0.6858433 best: 0.6861756 (118) total: 10m 38s remaining: 15m 44s
130: learn: 0.6669289 test: 0.8167238 test1: 0.6859176 best: 0.6861756 (118) total: 11m 27s remaining: 14m 46s
140: learn: 0.6676552 test: 0.8188321 test1: 0.6868088 best: 0.6869757 (133) total: 12m 18s remaining: 13m 52s
150: learn: 0.6686924 test: 0.8192335 test1: 0.6863474 best: 0.6869974 (143) total: 13m 16s remaining: 13m 6s
160: learn: 0.6691750 test: 0.8197694 test1: 0.6866646 best: 0.6869974 (143) total: 14m 16s remaining: 12m 19s
170: learn: 0.6693290 test: 0.8210153 test1: 0.6867297 best: 0.6869974 (143) total: 15m 8s remaining: 11m 25s
180: learn: 0.6700074 test: 0.8219643 test1: 0.6877937 best: 0.6877937 (180) total: 16m 2s remaining: 10m 32s
190: learn: 0.6703896 test: 0.8225243 test1: 0.6880093 best: 0.6880326 (189) total: 16m 55s remaining: 9m 39s
200: learn: 0.6709395 test: 0.8227387 test1: 0.6879566 best: 0.6881722 (196) total: 17m 50s remaining: 8m 47s
210: learn: 0.6715486 test: 0.8229027 test1: 0.6878556 best: 0.6883883 (203) total: 18m 45s remaining: 7m 54s
220: learn: 0.6718343 test: 0.8229727 test1: 0.6880716 best: 0.6883883 (203) total: 19m 38s remaining: 7m 1s
230: learn: 0.6723365 test: 0.8230236 test1: 0.6881206 best: 0.6884229 (225) total: 20m 33s remaining: 6m 8s
240: learn: 0.6728943 test: 0.8245618 test1: 0.6886997 best: 0.6886997 (240) total: 21m 26s remaining: 5m 15s
250: learn: 0.6731640 test: 0.8247809 test1: 0.6883544 best: 0.6887221 (242) total: 22m 18s remaining: 4m 21s
260: learn: 0.6735842 test: 0.8246325 test1: 0.6883028 best: 0.6887221 (242) total: 23m 10s remaining: 3m 27s
270: learn: 0.6738376 test: 0.8250170 test1: 0.6879131 best: 0.6887221 (242) total: 23m 59s remaining: 2m 34s
280: learn: 0.6743161 test: 0.8260126 test1: 0.6882345 best: 0.6887221 (242) total: 24m 46s remaining: 1m 40s
290: learn: 0.6746999 test: 0.8260436 test1: 0.6880987 best: 0.6887221 (242) total: 25m 38s remaining: 47.6s
Stopped by overfitting detector (50 iterations wait)
bestTest = 0.6887220573
bestIteration = 242
Shrink model to first 243 iterations.
================================================================================
==================================2==========================================
Learning rate set to 0.235227
0: learn: 0.5833973 test: 0.5833520 test1: 0.5849483 best: 0.5849483 (0) total: 6.93s remaining: 34m 31s
10: learn: 0.6405256 test: 0.7135342 test1: 0.6547645 best: 0.6547645 (10) total: 1m 1s remaining: 26m 44s
20: learn: 0.6475149 test: 0.7695936 test1: 0.6689917 best: 0.6689917 (20) total: 1m 50s remaining: 24m 24s
30: learn: 0.6529198 test: 0.7766768 test1: 0.6721959 best: 0.6723706 (29) total: 2m 36s remaining: 22m 33s
40: learn: 0.6540265 test: 0.7822622 test1: 0.6742004 best: 0.6742004 (40) total: 3m 23s remaining: 21m 24s
50: learn: 0.6570791 test: 0.7866966 test1: 0.6752796 best: 0.6752796 (50) total: 4m 12s remaining: 20m 30s
60: learn: 0.6592152 test: 0.8024322 test1: 0.6802499 best: 0.6802499 (60) total: 5m 2s remaining: 19m 43s
70: learn: 0.6613426 test: 0.8041085 test1: 0.6823285 best: 0.6824008 (68) total: 5m 50s remaining: 18m 49s
80: learn: 0.6623772 test: 0.8087681 test1: 0.6833217 best: 0.6838120 (74) total: 6m 38s remaining: 17m 57s
90: learn: 0.6636641 test: 0.8099415 test1: 0.6840324 best: 0.6841880 (89) total: 7m 29s remaining: 17m 12s
100: learn: 0.6641158 test: 0.8124503 test1: 0.6844058 best: 0.6844058 (100) total: 8m 24s remaining: 16m 33s
110: learn: 0.6651186 test: 0.8155927 test1: 0.6845911 best: 0.6845911 (110) total: 9m 15s remaining: 15m 45s
120: learn: 0.6655697 test: 0.8171436 test1: 0.6851192 best: 0.6853091 (117) total: 10m 4s remaining: 14m 54s
130: learn: 0.6661185 test: 0.8193981 test1: 0.6858256 best: 0.6859099 (129) total: 10m 56s remaining: 14m 7s
140: learn: 0.6668357 test: 0.8227551 test1: 0.6870682 best: 0.6870682 (140) total: 11m 51s remaining: 13m 22s
150: learn: 0.6673381 test: 0.8236013 test1: 0.6872794 best: 0.6874311 (149) total: 12m 43s remaining: 12m 33s
160: learn: 0.6681053 test: 0.8252143 test1: 0.6872611 best: 0.6874533 (155) total: 13m 33s remaining: 11m 42s
170: learn: 0.6686303 test: 0.8259186 test1: 0.6879214 best: 0.6879827 (168) total: 14m 22s remaining: 10m 50s
180: learn: 0.6688569 test: 0.8261430 test1: 0.6875905 best: 0.6879827 (168) total: 15m 11s remaining: 9m 59s
190: learn: 0.6694051 test: 0.8259791 test1: 0.6876940 best: 0.6879827 (168) total: 16m 2s remaining: 9m 9s
200: learn: 0.6699648 test: 0.8272785 test1: 0.6881933 best: 0.6882394 (196) total: 16m 56s remaining: 8m 20s
210: learn: 0.6703629 test: 0.8281450 test1: 0.6884431 best: 0.6885209 (205) total: 17m 48s remaining: 7m 30s
220: learn: 0.6709914 test: 0.8290551 test1: 0.6889117 best: 0.6891334 (219) total: 18m 39s remaining: 6m 40s
230: learn: 0.6715163 test: 0.8308608 test1: 0.6893840 best: 0.6895373 (227) total: 19m 31s remaining: 5m 49s
240: learn: 0.6719799 test: 0.8318220 test1: 0.6893311 best: 0.6895843 (236) total: 20m 19s remaining: 4m 58s
250: learn: 0.6723687 test: 0.8325428 test1: 0.6893355 best: 0.6895843 (236) total: 21m 10s remaining: 4m 7s
260: learn: 0.6727277 test: 0.8324867 test1: 0.6893129 best: 0.6896539 (255) total: 22m 4s remaining: 3m 17s
270: learn: 0.6730269 test: 0.8325291 test1: 0.6896326 best: 0.6896539 (255) total: 22m 54s remaining: 2m 27s
280: learn: 0.6732117 test: 0.8333668 test1: 0.6900134 best: 0.6900134 (280) total: 23m 47s remaining: 1m 36s
290: learn: 0.6738847 test: 0.8335661 test1: 0.6902571 best: 0.6902571 (290) total: 24m 38s remaining: 45.7s
299: learn: 0.6742525 test: 0.8340623 test1: 0.6898802 best: 0.6902571 (290) total: 25m 29s remaining: 0us
bestTest = 0.6902570524
bestIteration = 290
Shrink model to first 291 iterations.
================================================================================
==================================3==========================================
Learning rate set to 0.235227
0: learn: 0.6216538 test: 0.6229008 test1: 0.6232515 best: 0.6232515 (0) total: 4.91s remaining: 24m 28s
10: learn: 0.6409321 test: 0.7196293 test1: 0.6572837 best: 0.6572837 (10) total: 58s remaining: 25m 24s
20: learn: 0.6477092 test: 0.7508666 test1: 0.6703383 best: 0.6706611 (19) total: 1m 46s remaining: 23m 40s
30: learn: 0.6530137 test: 0.7801107 test1: 0.6792608 best: 0.6792608 (30) total: 2m 38s remaining: 22m 53s
40: learn: 0.6563886 test: 0.7880980 test1: 0.6822498 best: 0.6822498 (40) total: 3m 26s remaining: 21m 44s
50: learn: 0.6584105 test: 0.7898914 test1: 0.6822932 best: 0.6826247 (45) total: 4m 14s remaining: 20m 40s
60: learn: 0.6600730 test: 0.7955104 test1: 0.6835651 best: 0.6838458 (58) total: 5m 2s remaining: 19m 43s
70: learn: 0.6618595 test: 0.7995003 test1: 0.6847355 best: 0.6850094 (68) total: 5m 49s remaining: 18m 47s
80: learn: 0.6629089 test: 0.8038064 test1: 0.6862297 best: 0.6862297 (80) total: 6m 38s remaining: 17m 58s
90: learn: 0.6642163 test: 0.8072907 test1: 0.6873777 best: 0.6873819 (86) total: 7m 30s remaining: 17m 14s
100: learn: 0.6652178 test: 0.8099029 test1: 0.6871833 best: 0.6878281 (93) total: 8m 21s remaining: 16m 28s
110: learn: 0.6660726 test: 0.8102764 test1: 0.6874057 best: 0.6878281 (93) total: 9m 9s remaining: 15m 36s
120: learn: 0.6666968 test: 0.8105069 test1: 0.6875793 best: 0.6878281 (93) total: 9m 57s remaining: 14m 43s
130: learn: 0.6674710 test: 0.8135577 test1: 0.6885519 best: 0.6885519 (130) total: 10m 49s remaining: 13m 57s
140: learn: 0.6684035 test: 0.8146636 test1: 0.6890182 best: 0.6890182 (140) total: 11m 39s remaining: 13m 8s
150: learn: 0.6686630 test: 0.8171446 test1: 0.6896200 best: 0.6899891 (146) total: 12m 31s remaining: 12m 21s
160: learn: 0.6690546 test: 0.8183318 test1: 0.6899944 best: 0.6901300 (156) total: 13m 34s remaining: 11m 42s
170: learn: 0.6695222 test: 0.8210363 test1: 0.6905764 best: 0.6906455 (169) total: 14m 44s remaining: 11m 7s
180: learn: 0.6703622 test: 0.8230552 test1: 0.6913347 best: 0.6913347 (180) total: 15m 38s remaining: 10m 17s
190: learn: 0.6705191 test: 0.8236182 test1: 0.6912294 best: 0.6913541 (182) total: 16m 31s remaining: 9m 25s
200: learn: 0.6710788 test: 0.8245452 test1: 0.6905407 best: 0.6913541 (182) total: 17m 20s remaining: 8m 32s
210: learn: 0.6716998 test: 0.8254383 test1: 0.6899529 best: 0.6913541 (182) total: 18m 9s remaining: 7m 39s
220: learn: 0.6721097 test: 0.8255137 test1: 0.6896744 best: 0.6913541 (182) total: 19m 3s remaining: 6m 48s
230: learn: 0.6723847 test: 0.8258951 test1: 0.6900970 best: 0.6913541 (182) total: 19m 51s remaining: 5m 55s
Stopped by overfitting detector (50 iterations wait)
bestTest = 0.6913541358
bestIteration = 182
Shrink model to first 183 iterations.
================================================================================
==================================4==========================================
Learning rate set to 0.235227
0: learn: 0.6088897 test: 0.6138620 test1: 0.6135232 best: 0.6135232 (0) total: 9.63s remaining: 47m 57s
10: learn: 0.6413823 test: 0.7112563 test1: 0.6592475 best: 0.6593885 (9) total: 1m 6s remaining: 29m 7s
20: learn: 0.6477426 test: 0.7446712 test1: 0.6684289 best: 0.6693861 (18) total: 1m 57s remaining: 26m 6s
30: learn: 0.6517837 test: 0.7668160 test1: 0.6742435 best: 0.6742435 (30) total: 2m 48s remaining: 24m 20s
40: learn: 0.6543537 test: 0.7806687 test1: 0.6772691 best: 0.6772691 (40) total: 3m 40s remaining: 23m 10s
50: learn: 0.6572068 test: 0.7856670 test1: 0.6789735 best: 0.6791219 (47) total: 4m 30s remaining: 22m 2s
60: learn: 0.6595529 test: 0.8010034 test1: 0.6835494 best: 0.6838487 (58) total: 5m 22s remaining: 21m 2s
70: learn: 0.6609678 test: 0.8062261 test1: 0.6858258 best: 0.6858258 (70) total: 6m 14s remaining: 20m 7s
80: learn: 0.6623339 test: 0.8121792 test1: 0.6859007 best: 0.6866375 (75) total: 7m 5s remaining: 19m 9s
90: learn: 0.6636772 test: 0.8153475 test1: 0.6863057 best: 0.6866375 (75) total: 7m 55s remaining: 18m 11s
100: learn: 0.6646229 test: 0.8169151 test1: 0.6870916 best: 0.6873799 (98) total: 8m 48s remaining: 17m 21s
110: learn: 0.6659334 test: 0.8188139 test1: 0.6867931 best: 0.6873799 (98) total: 9m 55s remaining: 16m 54s
120: learn: 0.6667287 test: 0.8193332 test1: 0.6870785 best: 0.6873799 (98) total: 10m 46s remaining: 15m 56s
130: learn: 0.6676020 test: 0.8197830 test1: 0.6869205 best: 0.6873799 (98) total: 11m 39s remaining: 15m 2s
140: learn: 0.6679969 test: 0.8199328 test1: 0.6869941 best: 0.6873799 (98) total: 12m 29s remaining: 14m 4s
Stopped by overfitting detector (50 iterations wait)
bestTest = 0.6873798906
bestIteration = 98
Shrink model to first 99 iterations.
================================================================================
==================================5==========================================
Learning rate set to 0.235227
0: learn: 0.6485131 test: 0.6517068 test1: 0.6479244 best: 0.6479244 (0) total: 5.08s remaining: 25m 17s
10: learn: 0.6395082 test: 0.7089322 test1: 0.6549795 best: 0.6549795 (10) total: 1m 2s remaining: 27m 9s
20: learn: 0.6457125 test: 0.7400387 test1: 0.6659492 best: 0.6659492 (20) total: 1m 53s remaining: 25m 11s
30: learn: 0.6514184 test: 0.7604515 test1: 0.6730497 best: 0.6730497 (30) total: 2m 44s remaining: 23m 43s
40: learn: 0.6546387 test: 0.7762775 test1: 0.6760438 best: 0.6763032 (37) total: 3m 29s remaining: 22m 6s
50: learn: 0.6567014 test: 0.7823814 test1: 0.6762968 best: 0.6766550 (46) total: 4m 19s remaining: 21m 5s
60: learn: 0.6586215 test: 0.7945281 test1: 0.6778286 best: 0.6778465 (58) total: 5m 10s remaining: 20m 16s
70: learn: 0.6601434 test: 0.7992026 test1: 0.6820142 best: 0.6820142 (70) total: 6m 2s remaining: 19m 28s
80: learn: 0.6620918 test: 0.8051808 test1: 0.6835441 best: 0.6835441 (80) total: 6m 51s remaining: 18m 33s
90: learn: 0.6632307 test: 0.8095564 test1: 0.6847451 best: 0.6852213 (89) total: 7m 43s remaining: 17m 43s
100: learn: 0.6646119 test: 0.8102420 test1: 0.6855051 best: 0.6855051 (100) total: 8m 35s remaining: 16m 56s
110: learn: 0.6661142 test: 0.8136297 test1: 0.6855520 best: 0.6858795 (107) total: 9m 34s remaining: 16m 17s
120: learn: 0.6664666 test: 0.8142156 test1: 0.6856690 best: 0.6858795 (107) total: 10m 27s remaining: 15m 28s
130: learn: 0.6671449 test: 0.8158779 test1: 0.6858933 best: 0.6858933 (130) total: 11m 19s remaining: 14m 37s
140: learn: 0.6679393 test: 0.8168439 test1: 0.6859532 best: 0.6860749 (134) total: 12m 12s remaining: 13m 45s
150: learn: 0.6682700 test: 0.8174497 test1: 0.6852882 best: 0.6861378 (145) total: 13m 3s remaining: 12m 52s
160: learn: 0.6687193 test: 0.8181760 test1: 0.6852596 best: 0.6861378 (145) total: 13m 53s remaining: 11m 59s
170: learn: 0.6690934 test: 0.8181678 test1: 0.6847725 best: 0.6861378 (145) total: 14m 46s remaining: 11m 8s
180: learn: 0.6698601 test: 0.8202472 test1: 0.6854161 best: 0.6861378 (145) total: 15m 38s remaining: 10m 17s
190: learn: 0.6703541 test: 0.8222802 test1: 0.6853611 best: 0.6861378 (145) total: 16m 33s remaining: 9m 26s
Stopped by overfitting detector (50 iterations wait)
bestTest = 0.6861377913
bestIteration = 145
Shrink model to first 146 iterations.
================================================================================
Wall time: 4h 33min 14s
# Average the predicted probabilities over the 5 folds and apply the 0.4 threshold
sub = np.zeros([46404, 2])
for i in result:
    sub += i
pred_prob = sub / 5
temp = (pd.DataFrame(pred_prob) > 0.4)
pred = temp.iloc[:, 1].astype('int')
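As a side note, the fold averaging above is equivalent to taking the element-wise mean of the stacked fold predictions:

import numpy as np

# Same soft-voting average in one line
pred_prob = np.mean(result, axis=0)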
submission = pd.read_csv('sample_submission.csv')
submission['target'] = pred
submission.to_csv('Kfold_Catboost.csv', index=False)