카테고리 없음
[kaggle] 캐글 필사 - Parkinson's Disease Progression Prediction 1
๊ฐ๋ฐ์์ด๋ฌด
2023. 4. 27. 10:37
반응형
โ Kaggle Competition - AMPยฎ-Parkinson's Disease Progression Prediction
https://www.kaggle.com/competitions/amp-parkinsons-disease-progression-prediction
AMPยฎ-Parkinson's Disease Progression Prediction | Kaggle
www.kaggle.com
✔ The Goal of Competition - 파킨슨병 환자의 진행을 측정하는 MDS-UPDRS 점수를 예측하는 것
✔ 파킨슨병 환자와 연령이 일치하는 정상 대조군의 시간 경과에 따른
단백질 및 펩타이드 수치 데이터로 훈련된 모델을 개발
โ Copying in kaggle New Notebook
โ Copying Notebook : Simple Linear model with only clinical data
# List every file available under Kaggle's read-only input directory
import os

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Print the full path so it can be copied into read_csv calls
        print(os.path.join(dirname, filename))
/kaggle/input/amp-parkinsons-disease-progression-prediction/train_proteins.csv
/kaggle/input/amp-parkinsons-disease-progression-prediction/train_clinical_data.csv
/kaggle/input/amp-parkinsons-disease-progression-prediction/public_timeseries_testing_util.py
/kaggle/input/amp-parkinsons-disease-progression-prediction/supplemental_clinical_data.csv
/kaggle/input/amp-parkinsons-disease-progression-prediction/train_peptides.csv
/kaggle/input/amp-parkinsons-disease-progression-prediction/amp_pd_peptide/competition.cpython-37m-x86_64-linux-gnu.so
/kaggle/input/amp-parkinsons-disease-progression-prediction/amp_pd_peptide/__init__.py
/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/sample_submission.csv
/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/test_proteins.csv
/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/test_peptides.csv
/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/test.csv
โ Loading Libraries
# Loading libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
import tqdm  # progress reporting for long-running work
import re  # standard-library regular expressions
from itertools import product  # iterator building blocks for efficient looping
from functools import reduce  # helper for higher-order functions
import warnings  # warning control

# Silence the noisy warning categories for the rest of the notebook
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# Display floats with only three digits after the decimal point
pd.set_option('display.float_format',lambda x: '%.3f' % x)
โ Reading the Datasets
# Load the competition CSVs: protein and peptide measurements, the main
# clinical records, and the supplemental clinical records.
protein_data = pd.read_csv(
    '/kaggle/input/amp-parkinsons-disease-progression-prediction/train_proteins.csv'
)
peptides_data = pd.read_csv(
    '/kaggle/input/amp-parkinsons-disease-progression-prediction/train_peptides.csv'
)
target_data = pd.read_csv(
    '/kaggle/input/amp-parkinsons-disease-progression-prediction/train_clinical_data.csv'
)
sup_target_data = pd.read_csv(
    '/kaggle/input/amp-parkinsons-disease-progression-prediction/supplemental_clinical_data.csv'
)
# Notebook display: (rows, columns) of the three main training tables
(protein_data.shape, peptides_data.shape, target_data.shape)
## ((232741, 5), (981834, 6), (2615, 8))
โ target data + sup_target_data (Merging)
👉 모델링에 임상 데이터 (train_clinical_data = target_data)만 사용할 것이므로,
target_data 와 sup_target_data 병합하기
# Only clinical data is used for modelling, so stack the supplemental
# clinical records under the main ones and renumber the rows from 0.
target_data = pd.concat([target_data, sup_target_data], axis=0, ignore_index=True)
# Drop the month-5 visits (the original author treats them as uninformative)
target_data = target_data.loc[target_data['visit_month'] != 5].copy()
# Notebook display: shape plus the number of distinct visits, patients and visit months
(
    target_data.shape,
    target_data['visit_id'].nunique(),
    target_data['patient_id'].nunique(),
    target_data['visit_month'].nunique(),
)
## ((4720, 8), 4720, 1019, 17)
โ target_data ๊ฒฐ์ธก์น ์ฑ์ฐ๊ธฐ
๐ upd23b_clinical_state_on_medication ์ปฌ๋ผ์ ๊ฒฐ์ธก์น๋ฅผ unknown์ผ๋ก ์ฑ์
# Fill missing values
# Missing upd23b_clinical_state_on_medication entries become 'unknown'.
# The result is assigned back to the frame: calling fillna(inplace=True) on a
# column obtained through attribute access is a chained operation that does
# not update target_data when pandas Copy-on-Write is active.
target_data['upd23b_clinical_state_on_medication'] = (
    target_data['upd23b_clinical_state_on_medication'].fillna('unknown')
)
# Notebook display: remaining missing values per column
target_data.isna().sum()
# visit_id 0
# patient_id 0
# visit_month 0
# updrs_1 96
# updrs_2 98
# updrs_3 30
# updrs_4 1863
# upd23b_clinical_state_on_medication 0
# dtype: int64
✔ 카테고리 컬럼 설정
# Column groups shared by the rest of the notebook
# Identifier columns of a clinical record
id_cols = [
    'visit_id',
    'patient_id',
    'visit_month',
]
# The four UPDRS scores to predict
target_cols = [
    'updrs_1',
    'updrs_2',
    'updrs_3',
    'updrs_4',
]
# Month offsets: 0, 6, 12 and 24 months
month_list = [
    0,
    6,
    12,
    24,
]
✔ create_target_features 함수 생성 (by target data)
def create_target_features(target_data, test=None, horizon_months=(0, 6, 12, 24)):
    """Build visit-level model features from clinical (target) data.

    Reads the module-level ``target_cols`` list for the score column names.

    Args:
        target_data: Clinical records with ``patient_id``, ``visit_month``,
            ``upd23b_clinical_state_on_medication`` and the ``target_cols``
            score columns.
        test: Optional frame of visits to build features for. It needs
            ``patient_id`` and ``visit_month``; the medication-state column
            is created as ``'unknown'`` when absent. When ``None`` the
            features are built for the visits in ``target_data`` itself.
        horizon_months: Month offsets added to every test visit so that each
            patient gets one row per offset. Only used when ``test`` is given.

    Returns:
        ``(model_data, feature_cols)``. ``model_data`` holds ``patient_id``
        plus the feature columns, and in training mode (``test is None``)
        also the imputed ``target_cols``. ``feature_cols`` lists every
        ``model_data`` column except ``patient_id`` and the targets.
    """
    med_col = 'upd23b_clinical_state_on_medication'
    key_cols = ['patient_id', 'visit_month', med_col]

    # Base frame: one row per (patient, visit month, medication state).
    if test is None:
        test1 = target_data[key_cols].copy()
    else:
        test1 = test.copy()
        if med_col not in test1.columns:
            test1[med_col] = 'unknown'
    test1[med_col] = test1[med_col].fillna('unknown')
    test1 = test1[key_cols].drop_duplicates().copy()

    # Test mode only: remember the original visit month, then replicate each
    # patient's rows once per horizon with the offset added to visit_month.
    # NOTE(review): the scraped source had lost this guard and contained a
    # stray `return test1` here; the guard is restored from the accompanying
    # comments, the `if test is None` check further down, and the recorded
    # training output (4720 rows, 79 features) - confirm against the
    # original notebook.
    if test is not None:
        test1['visit_month_orig'] = test1['visit_month']
        expanded = [
            rows.assign(visit_month=rows['visit_month'] + offset)
            for _, rows in test1.groupby('patient_id', sort=False)
            for offset in horizon_months
        ]
        # One concat instead of growing a frame inside the loop.
        test1 = pd.concat(expanded, axis=0) if expanded else test1.iloc[0:0].copy()

    # Impute missing scores with the median of their
    # (visit_month, medication state) group.
    target_data1 = target_data.copy()
    # Keep the medication labels consistent with test1 so the merge below
    # can match 'unknown' rows even if the caller did not pre-fill them.
    target_data1[med_col] = target_data1[med_col].fillna('unknown')
    group_medians = (
        target_data1.groupby(['visit_month', med_col])[target_cols].transform('median')
    )
    target_data1[target_cols] = target_data1[target_cols].fillna(group_medians)
    target_data1 = target_data1[key_cols + target_cols]

    # Group features: per visit_month, per medication state, and per
    # combination of both - score statistics plus the updrs_1 count.
    all_grp_cols = [['visit_month'], [med_col], ['visit_month', med_col]]
    target_data2 = target_data1[['visit_month', med_col]].drop_duplicates()
    for grp_col in all_grp_cols:
        suffix = '_'.join(grp_col)
        grouped = target_data1.groupby(grp_col)
        stats = grouped[target_cols].agg(['min', 'max', 'mean', 'median', 'sum', 'std'])
        # Flatten (score, statistic) into e.g. 'updrs_1_min_visit_month'.
        stats.columns = [f'{score}_{stat}_{suffix}' for score, stat in stats.columns]
        counts = grouped['updrs_1'].count().rename(f'{suffix}_count')
        target_data2 = target_data2.join(stats, on=grp_col).join(counts, on=grp_col)

    # One-hot flags for the medication state: 1 when it matches, else 0.
    for flag, state in (('med_unknown', 'unknown'), ('med_off', 'Off'), ('med_on', 'On')):
        test1[flag] = (test1[med_col] == state).astype(int)

    model_data = test1.merge(
        target_data2, on=['visit_month', med_col], how='inner'
    ).drop(columns=med_col)
    feature_cols = model_data.drop(columns=['patient_id']).columns.tolist()

    # Fill remaining gaps with the per-visit_month median. transform() keeps
    # the row order and never drops the grouping column, unlike
    # groupby.apply whose handling of grouping columns varies by version.
    fill_cols = [col for col in feature_cols if col != 'visit_month']
    visit_medians = model_data.groupby('visit_month')[fill_cols].transform('median')
    model_data[fill_cols] = model_data[fill_cols].fillna(visit_medians)

    # Training mode: attach the imputed targets to every visit.
    if test is None:
        model_data = model_data.merge(
            target_data1[['patient_id', 'visit_month'] + target_cols],
            on=['patient_id', 'visit_month'],
            how='inner',
        )
    return model_data, feature_cols
✔ feature 가져오기 (by create_target_features)
# Build the training feature table and the list of feature column names
model_data, feature_cols = create_target_features(target_data=target_data)
(model_data.shape, len(feature_cols))
## ((4720, 84), 79)
# Check (not removal) of duplicates: the de-duplicated (patient_id, visit_month)
# pairs should have as many rows as model_data itself
(model_data.loc[:, ['patient_id', 'visit_month']].drop_duplicates().shape, model_data.shape)
## ((4720, 2), (4720, 84))
✔ feature_cols 설정
# Wider alternative feature set, kept for reference:
# feature_cols = ['visit_month','med_unknown','med_off','med_on','visit_month_count',
# 'upd23b_clinical_state_on_medication_count','visit_month_upd23b_clinical_state_on_medication_count']
# Features actually used: the visit month and the visit_month_count column
feature_cols = [
    'visit_month',
    'visit_month_count',
]
반응형