✅ Google Colab Drive Mount / Data Load
## GOOGLE COLAB DRIVE MOUNT
from google.colab import drive
drive.mount('/content/drive')
# Mounted at /content/drive
## DATA LOAD
import pandas as pd
print('pandas version:', pd.__version__)
wine = pd.read_csv('https://bit.ly/wine_csv_data')
wine.head()
#    alcohol  sugar    pH  class
# 0      9.4    1.9  3.51    0.0
# 1      9.8    2.6  3.20    0.0
# 2      9.8    2.3  3.26    0.0
# 3      9.8    1.9  3.16    0.0
# 4      9.4    1.9  3.51    0.0
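Not in the original notebook, but a quick sanity check before modeling is cheap; in Colab both the info printout and the last expression display on their own.
# sanity check (my addition): dtypes and missing values
wine.info()       # 6497 rows, 4 float columns expected
wine.isna().sum() # should be 0 for every column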
📌 numpy - Numeric Arrays
# numpy - numeric operations (convert the DataFrame columns to arrays)
import numpy as np
data = wine[['alcohol', 'sugar', 'pH']].to_numpy()
target = wine['class'].to_numpy()
data.shape, target.shape
# ((6497, 3), (6497,))
✅ Dataset Split
## SPLIT THE DATASET
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=0.2, random_state=42
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape
# ((5197, 3), (1300, 3), (5197,), (1300,))
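One optional variant, not used in this post: the two wine classes are imbalanced, and passing stratify keeps the class ratio identical in both splits (the _s variable names are mine).
# optional stratified split (illustrative, not used below)
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    data, target, test_size=0.2, random_state=42, stratify=target
)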
📌 Feature Engineering
① Normalizing Numeric Data
- every numeric feature comes in its own unit
ex. height vs. weight - 180cm, 1.8m, 180kg => the model can learn the wrong signal from raw magnitudes (normalization needed)
- Min-Max Scaler : rescales the distribution into the 0 - 1 range
- Standard Scaler : shifts the mean to 0 and scales the variance to 1
- normalization changes the values and their original units are lost, but the ordering of the data points never changes (⭐⭐⭐) - see the sketch below
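A minimal sketch of the two scalers on toy data, confirming the ordering claim (toy is a made-up array, not from the wine data):
from sklearn.preprocessing import MinMaxScaler, StandardScaler
toy = np.array([[1.0], [2.0], [4.0], [8.0]])
print(MinMaxScaler().fit_transform(toy).ravel())   # squeezed into [0, 1]
print(StandardScaler().fit_transform(toy).ravel()) # mean 0, variance 1
# both outputs increase in exactly the same order as the raw values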
② Ground Rules (⭐⭐⭐)
- train and test data must stay separated
- always suspect that your own code may have mixed train and test data
- work in a direction that avoids Data Leakage - a minimal illustration follows
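A minimal sketch of the leakage pitfall (illustrative only; ss_bad / ss_ok are my names):
from sklearn.preprocessing import StandardScaler
# WRONG: fitting on the full dataset lets test-set statistics leak into training
ss_bad = StandardScaler().fit(data)
# RIGHT: fit on the training split only, then reuse that scaler on the test split
ss_ok = StandardScaler().fit(X_train)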
✅ Train Data Scaling
# train data scaling
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
ss.fit(X_train)
## fit the scaler on the train data, then apply it to X_train / X_test
train_scaled = ss.transform(X_train)
test_scaled = ss.transform(X_test)
train_scaled.shape, test_scaled.shape
# ((5197, 3), (1300, 3))
## confirm the scaled independent variables were created
X_train[0], train_scaled[0]
# (array([10.5 , 7.7 , 3.19]), array([ 0.01376574, 0.4646116 , -0.16888369]))
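Another quick check (my addition): after StandardScaler, each training column should have mean ~0 and standard deviation ~1.
print(train_scaled.mean(axis=0)) # ~[0. 0. 0.]
print(train_scaled.std(axis=0))  # ~[1. 1. 1.]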
📌 "scaled" means the transformed values are what the model actually sees (train_scaled)
📌 transform the test dataset with the scaler fit on the train data, then predict - see the example after the modeling block
✅ Modeling
# modeling starts once the independent variables are ready
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(random_state=42)
dt.fit(train_scaled, y_train)
print(dt.score(train_scaled, y_train)) # train score 99.69 %
print(dt.score(test_scaled, y_test)) # test score 85.92 %
# the train - test gap is large, so we judge that overfitting has occurred
# reducing the train - test gap is what matters!!
# i.e. keep overfitting from happening
# creating derived variables is covered in a later session
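As promised above, a small illustration: any new sample must pass through the same fitted scaler before predict (the sample values are borrowed from X_train[0]; the variable name is mine).
new_sample = [[10.5, 7.7, 3.19]]            # alcohol, sugar, pH
print(dt.predict(ss.transform(new_sample))) # -> the predicted class, 0.0 or 1.0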
📌 Decision Tree
# visualize the fitted decision tree
# with no hyperparameters set, the tree keeps splitting
# until it is fully optimized to the train data (slow to draw)
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt
plt.figure()
plot_tree(dt)
plt.show()
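Not in the original post, but a fitted tree also exposes feature_importances_, which sums to 1 and shows which feature drives the splits:
for name, imp in zip(['alcohol', 'sugar', 'pH'], dt.feature_importances_):
    print(name, round(imp, 3)) # importances sum to 1.0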
📌 max_depth Adjustment
- the smaller max_depth is, the lower the accuracy
📌 push it too low and underfitting occurs (train score < test score)
# max_depth here only limits how much of the tree is drawn, not the model itself
plt.figure(figsize=(10,7))
plot_tree(dt, max_depth=1, filled=True, feature_names=['alcohol', 'sugar', 'pH'])
plt.show()
# draw one level deeper
plt.figure(figsize=(10,7))
plot_tree(dt, max_depth=2, filled=True, feature_names=['alcohol', 'sugar', 'pH'])
plt.show()
## tune the max_depth value to find a better model
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(max_depth=1, random_state=42)
dt.fit(train_scaled, y_train)
print(dt.score(train_scaled, y_train)) # train score 75.8 %
print(dt.score(test_scaled, y_test)) # test score 73.8 %
# 0.7579372715027901
# 0.7376923076923076
plt.figure(figsize=(10,7))
plot_tree(dt, max_depth=3, filled=True, feature_names=['alcohol', 'sugar', 'pH'])
plt.show()
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(max_depth=3, random_state=42)
dt.fit(train_scaled, y_train)
print(dt.score(train_scaled, y_train)) # train score 84.5 %
print(dt.score(test_scaled, y_test)) # test score 84.2 %
# 0.8454877814123533
# 0.8415384615384616
plt.figure(figsize=(10,7))
plot_tree(dt, max_depth=3, filled=True, feature_names=['alcohol', 'sugar', 'pH'])
plt.show()
📌 The train and test scores are now close, so we can tell overfitting has not occurred
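A minimal sketch for picking max_depth more systematically: sweep a few depths and watch the train/test gap (same data and seed, so depths 1 and 3 should reproduce the scores above).
for depth in range(1, 8):
    m = DecisionTreeClassifier(max_depth=depth, random_state=42)
    m.fit(train_scaled, y_train)
    print(depth, m.score(train_scaled, y_train), m.score(test_scaled, y_test))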