# Widen the notebook container to 90% of the browser window.
# `display` and `HTML` are imported from the public IPython.display module;
# importing them from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style> .container{width:90% !important;}</style>"))

1. 배깅(Bagging)이란?¶

배깅(Bagging)은 Bootstrap Aggregating의 약자로, 보팅(Voting)과는 달리 동일한 알고리즘으로 여러 분류기를 만들어 보팅으로 최종 결정하는 알고리즘

배깅은 다음과 같은 방식으로 진행됩니다.

(1) 동일한 알고리즘을 사용하는 일정 수의 분류기 생성
(2) 각각의 분류기는 부트스트래핑(Bootstrapping) 방식으로 생성된 샘플 데이터를 학습
(3) 최종적으로 모든 분류기가 보팅을 통해 예측 결정

※ 부트스트래핑 샘플링은 전체 데이터에서 일부 데이터의 중복을 허용하여(복원 추출) 샘플을 뽑는 방식

2. 랜덤포레스트(RandomForest)¶

랜덤 포레스트는 여러 개의 결정트리(Decision Tree)를 활용한 배깅 방식의 대표적인 알고리즘

장점¶

결정 트리의 쉽고 직관적인 장점을 그대로 가지고 있음
앙상블 알고리즘 중 비교적 빠른 수행 속도를 가지고 있음
다양한 분야에서 좋은 성능을 나타냄

단점¶

하이퍼 파라미터가 많아 튜닝을 위한 시간이 많이 소요됨

사용자 행동 데이터 세트를 이용한 RandomForest 예측¶

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings('ignore')

# 데이터셋을 구성하는 함수 설정
def _dedupe_feature_names(names):
    """Return *names* with repeated entries made unique.

    The first occurrence of a name is kept unchanged. Each later repeat gets
    a numeric suffix (``name_1``, ``name_2``, ...); a suffix that is already
    taken is skipped, so the returned list never contains duplicates.
    """
    seen = set()
    unique = []
    for name in names:
        candidate = name
        suffix = 0
        while candidate in seen:
            suffix += 1
            candidate = '{}_{}'.format(name, suffix)
        seen.add(candidate)
        unique.append(candidate)
    return unique


def get_human_dataset():
    """Load the human-activity train/test features and labels.

    All files are read from the ``human_activity/`` directory relative to the
    current working directory and are parsed as whitespace-separated text.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)`` as DataFrames. The
        feature frames use the names listed in ``features.txt`` as columns
        (repeated names are suffixed to keep them unique); the label frames
        have a single column named ``action``.
    """
    # The data files are whitespace-separated, so a regex separator is used.
    # It is written as a raw string so that ``\s`` is not treated as a
    # (invalid) string escape sequence.
    feature_name_df = pd.read_csv('human_activity/features.txt', sep=r'\s+',
                                  header=None, names=['column_index', 'column_name'])

    # Extract the feature names as a plain list to use as column names.
    # read_csv does not accept duplicate entries in ``names``, so repeated
    # feature names are made unique first.
    feature_name = _dedupe_feature_names(feature_name_df.iloc[:, 1].values.tolist())

    # Load the training and test feature sets, labelling columns with feature_name.
    X_train = pd.read_csv('human_activity/train/X_train.txt', sep=r'\s+', names=feature_name)
    X_test = pd.read_csv('human_activity/test/X_test.txt', sep=r'\s+', names=feature_name)

    # Load the training and test labels into single-column frames named 'action'.
    y_train = pd.read_csv('human_activity/train/y_train.txt', sep=r'\s+', names=['action'])
    y_test = pd.read_csv('human_activity/test/y_test.txt', sep=r'\s+', names=['action'])

    # Return all four loaded frames.
    return X_train, X_test, y_train, y_test

# Load the train/test feature and label DataFrames.
X_train, X_test, y_train, y_test = get_human_dataset()

# Train a random forest and evaluate its accuracy on the separate test set.
# The labels come back as a single-column DataFrame; they are flattened to a
# 1-D array so the classifier receives the target in the shape it expects
# rather than relying on an implicit column-vector conversion.
rf_clf = RandomForestClassifier(random_state=0)
rf_clf.fit(X_train, y_train.values.ravel())
pred = rf_clf.predict(X_test)
accuracy = accuracy_score(y_test, pred)
print('랜덤 포레스트 정확도: {:.4f}'.format(accuracy))

랜덤 포레스트 정확도: 0.9108

3. 랜덤포레스트 하이퍼 파라미터 튜닝¶

랜덤포레스트는 트리기반의 하이퍼 파라미터에 배깅, 부스팅, 학습, 정규화 등을 위한 하이퍼 파라미터까지 추가되므로 튜닝할 파라미터가 많습니다.

파라미터 명	설명
n_estimators	- 결정트리의 갯수를 지정 - Default = 10 (scikit-learn 0.22부터는 100) - 무작정 트리 갯수를 늘리면 성능 좋아지는 것 대비 시간이 걸릴 수 있음
min_samples_split	- 노드를 분할하기 위한 최소한의 샘플 데이터수 → 과적합을 제어하는데 사용 - Default = 2 → 작게 설정할 수록 분할 노드가 많아져 과적합 가능성 증가
min_samples_leaf	- 리프노드가 되기 위해 필요한 최소한의 샘플 데이터수 - min_samples_split과 함께 과적합 제어 용도 - 불균형 데이터의 경우 특정 클래스의 데이터가 극도로 작을 수 있으므로 작게 설정 필요
max_features	- 최적의 분할을 위해 고려할 최대 feature 개수 - Default = 'auto' (결정트리에서는 default가 None이었음; scikit-learn 1.1부터 랜덤포레스트의 기본값은 'sqrt') - int형으로 지정 → 피처 갯수 / float형으로 지정 → 비중 - sqrt 또는 auto : 전체 피처 중 √(피처개수) 만큼 선정 - log2 : 전체 피처 중 log2(전체 피처 개수) 만큼 선정
max_depth	- 트리의 최대 깊이 - default = None → 완벽하게 클래스 값이 결정될 때 까지 분할 또는 데이터 개수가 min_samples_split보다 작아질 때까지 분할 - 깊이가 깊어지면 과적합될 수 있으므로 적절히 제어 필요
max_leaf_nodes	리프노드의 최대 개수

# Show RandomForestClassifier's hyperparameters in their default state
model = RandomForestClassifier()
model

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

GridSearchCV를 통한 랜덤포레스트의 하이퍼 파라미터 튜닝¶

from sklearn.model_selection import GridSearchCV

# Hyperparameter grid to search: every combination of these values is tried.
params = { 'n_estimators' : [10, 100],
           'max_depth' : [6, 8, 10, 12],
           'min_samples_leaf' : [8, 12, 18],
           'min_samples_split' : [8, 16, 20]
            }

# Create the RandomForestClassifier, then run GridSearchCV with 3-fold CV.
rf_clf = RandomForestClassifier(random_state = 0, n_jobs = -1)
grid_cv = GridSearchCV(rf_clf, param_grid = params, cv = 3, n_jobs = -1)
# y_train is a single-column DataFrame; flatten it to 1-D so the estimator
# receives the target in the shape it expects.
grid_cv.fit(X_train, y_train.values.ravel())

print('최적 하이퍼 파라미터: ', grid_cv.best_params_)
print('최고 예측 정확도: {:.4f}'.format(grid_cv.best_score_))

최적 하이퍼 파라미터:  {'max_depth': 12, 'min_samples_leaf': 8, 'min_samples_split': 8, 'n_estimators': 100}
최고 예측 정확도: 0.9206

# Retrain with the best hyperparameters reported by the grid search and
# measure prediction accuracy on the test set.
rf_clf1 = RandomForestClassifier(n_estimators = 100,
                                max_depth = 12,
                                min_samples_leaf = 8,
                                min_samples_split = 8,
                                random_state = 0,
                                n_jobs = -1)
# y_train is a single-column DataFrame; flatten it to 1-D so the estimator
# receives the target in the shape it expects.
rf_clf1.fit(X_train, y_train.values.ravel())
pred = rf_clf1.predict(X_test)
print('예측 정확도: {:.4f}'.format(accuracy_score(y_test,pred)))

예측 정확도: 0.9230

Random Forest의 각 피처의 중요도 시각화 : feature_importances_¶

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Pair each importance score from the fitted forest with its feature name,
# then keep the 20 highest-scoring features.
ftr_importances_values = rf_clf1.feature_importances_
ftr_importances = pd.Series(ftr_importances_values, index = X_train.columns)
ftr_top20 = ftr_importances.sort_values(ascending=False)[:20]

# Bar chart with the importance on the x-axis and the feature name on the y-axis.
plt.figure(figsize=(8,6))
plt.title('Top 20 Feature Importances')
sns.barplot(x=ftr_top20, y=ftr_top20.index)
plt.show()

자료출처 : https://pandas.pydata.org/Pandas_Cheat_Sheet.pdf

# Widen the notebook container to 90% of the browser window.
# `display` and `HTML` are imported from the public IPython.display module;
# importing them from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style> .container{width:90% !important;}</style>"))

Handling Missing Data¶

import pandas as pd
import numpy as np

# Build a small example DataFrame containing NaN values
df = pd.DataFrame( [ [np.nan, 2, np.nan, 0], 
                     [3, 4, np.nan, 1],
                     [np.nan, np.nan, np.nan, 5] ],
                                 columns = list('ABCD') )
df

dropna(axis = ?, how = ?, inplace = ?) : 결측치 버리기

axis = 0 (default) : 행 버리기 ⟷ axis = 1 : 컬럼 버리기
how = 'any' (default) : 행 또는 열의 NaN이 하나라도 있을 때 버리기 ⟷ how = 'all' : 전체 행 또는 열의 값이 NaN일 때 버리기
inplace = False (default) : drop한 결과 조회만 하기 ⟷ inplace = True : drop한 결과 데이터프레임에 바로 저장

# Drop columns whose values are all NaN
df.dropna(axis=1, how= 'all')

# Drop columns that contain at least one NaN
df.dropna(axis=1, how= 'any')

# Drop rows whose values are all NaN (axis=0 selects rows, not columns)
df.dropna(axis=0, how='all')

# Drop rows that contain at least one NaN
df.dropna(axis=0, how='any')

df.fillna : NaN 을 지정해준 값으로 채워줌

# Fill every NaN with 0
df.fillna(0)

# Fill NaNs column by column using a {column: value} dict
values = { 'A' : 0, 'B': 1, 'C': 2, 'D': 3}
df.fillna(value=values)

# Fill every NaN in the frame with the median of column 'D'
fill_na_value = df['D'].median()
df.fillna(fill_na_value)

# Count the missing (NaN) values in each column
df.isnull().sum()

A    2
B    1
C    3
D    0
dtype: int64

# Count the non-missing values in each column
df.notnull().sum()

A    1
B    2
C    0
D    3
dtype: int64

자료출처 : https://pandas.pydata.org/Pandas_Cheat_Sheet.pdf

# Widen the notebook container to 90% of the browser window.
# `display` and `HTML` are imported from the public IPython.display module;
# importing them from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style> .container{width:90% !important;}</style>"))

Summarize Data¶

import pandas as pd
import seaborn as sns
import numpy as np

# Load the example dataset and check its (rows, columns) shape
df = sns.load_dataset('iris')
df.shape

(150, 5)

# Preview the first rows of the frame
df.head()

# Count how many rows there are for each value of the categorical variable
df['species'].value_counts()

virginica     50
setosa        50
versicolor    50
Name: species, dtype: int64

# Convert the value_counts() result into a DataFrame
df['species'].value_counts().to_frame()
# pd.DataFrame(df['species'].value_counts()) gives the same result

# Number of rows in the DataFrame
len(df)
# df.shape[0] gives the same result

150

# Number of distinct values of the variable
df['species'].nunique()

3

# describe() reports the basic summary statistics.
df.describe()

# include='all' additionally reports unique, top and freq for categorical variables.
df.describe(include='all')

# Summary statistics for the string (object-dtype) columns only.
# The builtin `object` is used here: the `np.object` alias is deprecated and
# no longer exists in current NumPy releases.
df.describe(include=[object])

# Summary statistics for the numeric columns only.
df.describe(include=[np.number])

# Sum of the column's values
df['petal_width'].sum()

179.90000000000003

# Number of non-null rows in the column
df['petal_width'].count()

150

# Median of the column
df['petal_width'].median()

1.3

# Mean of the column
df['petal_width'].mean()

1.199333333333334

# Mean of each numeric column. numeric_only=True restricts the reduction to
# numeric columns so the string column 'species' does not make it fail.
df.mean(numeric_only=True)

sepal_length    5.843333
sepal_width     3.057333
petal_length    3.758000
petal_width     1.199333
dtype: float64

# First and third quartiles of each numeric column. numeric_only=True
# restricts the computation to numeric columns so the string column
# 'species' does not make it fail.
df.quantile([0.25, 0.75], numeric_only=True)

# Maximum value of each column in the DataFrame
df.max()

sepal_length          7.9
sepal_width           4.4
petal_length          6.9
petal_width           2.5
species         virginica
dtype: object

# Minimum value of each column in the DataFrame
df.min()

sepal_length       4.3
sepal_width          2
petal_length         1
petal_width        0.1
species         setosa
dtype: object

# Variance of each numeric column. numeric_only=True restricts the reduction
# to numeric columns so the string column 'species' does not make it fail.
df.var(numeric_only=True)

sepal_length    0.685694
sepal_width     0.189979
petal_length    3.116278
petal_width     0.581006
dtype: float64

# Standard deviation of each numeric column. numeric_only=True restricts the
# reduction to numeric columns so the string column 'species' does not make
# it fail.
df.std(numeric_only=True)

sepal_length    0.828066
sepal_width     0.435866
petal_length    1.765298
petal_width     0.762238
dtype: float64

apply(function)¶

# Define a small example helper function
def smp(x):
    """Return the last three items of *x* (all of *x* if it is shorter)."""
    return x[-3:]

# Apply an anonymous lambda: keep the first three characters of each species name
df['species_3'] = df['species'].apply(lambda x : x[:3])

# Apply the named function defined beforehand
df['species_4'] = df['species'].apply(smp)

df.head()

[Chapter 4. 분류] XGBoost(eXtraGradient Boost) (3)	2019.10.27
[Chapter 4. 분류] 부스팅알고리즘(AdaBoost, GBM) (0)	2019.10.20
[Chapter 4. 분류] 앙상블 학습 (0)	2019.10.14
[Chapter 4. 분류] Decision Tree Classifier (0)	2019.10.03
[Chapter 3. 평가] 피마 인디언 당뇨병 데이터셋을 통한 평가지표 실습 (0)	2019.10.03

ML History 4 : Decision Tree (0)	2019.10.20
ML History 3 : Neural Networks (0)	2019.10.20
ML History 1 : Linear Regression (0)	2019.10.18
Supervised Learning 2 : Regression and Classification (0)	2019.10.15
Supervised Learning 1 :Supervised Learning (0)	2019.10.15

ML History 3 : Neural Networks (0)	2019.10.20
ML History 2 : Perceptron (0)	2019.10.18
Supervised Learning 2 : Regression and Classification (0)	2019.10.15
Supervised Learning 1 :Supervised Learning (0)	2019.10.15
Introduction to Launching into ML (0)	2019.10.15

	A	B	C	D
0	0.0	2.0	2.0	0
1	3.0	4.0	2.0	1
2	0.0	1.0	2.0	5

	A	B	C	D
0	1.0	2.0	1.0	0
1	3.0	4.0	1.0	1
2	1.0	1.0	1.0	5

데이터분석, 머신러닝 정리 노트

전체 글

[Chapter 4. 분류] 랜덤포레스트(Random Forest)

1. 배깅(Bagging)이란?¶

2. 랜덤포레스트(RandomForest)¶

장점¶

단점¶

사용자 행동 데이터 세트를 이용한 RandomForest 예측¶

3. 랜덤포레스트 하이퍼 파라미터 튜닝¶

GridSearchCV를 통한 랜덤포레스트의 하이퍼 파라미터 튜닝¶

Random Forest의 각 피처의 중요도 시각화 : feature_importances_¶

'Machine Learning > 파이썬 머신러닝 완벽가이드 학습' 카테고리의 다른 글

ML History 2 : Perceptron

'구글 머신러닝 스터디잼(중급) > Launching into Machine Learning' 카테고리의 다른 글

ML History 1 : Linear Regression

'구글 머신러닝 스터디잼(중급) > Launching into Machine Learning' 카테고리의 다른 글

5. Handling Missing Data(결측치 다루기)

Handling Missing Data¶

'Python > Pandas Cheat Sheet' 카테고리의 다른 글

4. Summarize Data(자료 요약하기)

Summarize Data¶

apply(function)¶

'Python > Pandas Cheat Sheet' 카테고리의 다른 글

Supervised Learning 2 : Regression and Classification

'구글 머신러닝 스터디잼(중급) > Launching into Machine Learning' 카테고리의 다른 글

Supervised Learning 1 :Supervised Learning

'구글 머신러닝 스터디잼(중급) > Launching into Machine Learning' 카테고리의 다른 글

Introduction to Launching into ML

'구글 머신러닝 스터디잼(중급) > Launching into Machine Learning' 카테고리의 다른 글

+ Recent posts

티스토리툴바

4. Summarize Data(자료 요약하기) (0)	2019.10.16
3. Subset Observations(Columns) (열 데이터 다루기) (0)	2019.10.09
2. Subset Observations(Rows) (행 데이터 다루기) (0)	2019.10.09
1. Creating DataFrame(데이터프레임 만들기) (0)	2019.10.09

	sepal_length	sepal_width	petal_length	petal_width	species
0	5.1	3.5	1.4	0.2	setosa
1	4.9	3.0	1.4	0.2	setosa
2	4.7	3.2	1.3	0.2	setosa
3	4.6	3.1	1.5	0.2	setosa
4	5.0	3.6	1.4	0.2	setosa

	sepal_length	sepal_width	petal_length	petal_width
count	150.000000	150.000000	150.000000	150.000000
mean	5.843333	3.057333	3.758000	1.199333
std	0.828066	0.435866	1.765298	0.762238
min	4.300000	2.000000	1.000000	0.100000
25%	5.100000	2.800000	1.600000	0.300000
50%	5.800000	3.000000	4.350000	1.300000
75%	6.400000	3.300000	5.100000	1.800000
max	7.900000	4.400000	6.900000	2.500000

5. Handling Missing Data(결측치 다루기) (0)	2019.10.16
3. Subset Observations(Columns) (열 데이터 다루기) (0)	2019.10.09
2. Subset Observations(Rows) (행 데이터 다루기) (0)	2019.10.09
1. Creating DataFrame(데이터프레임 만들기) (0)	2019.10.09

	sepal_length	sepal_width	petal_length	petal_width
0.25	5.1	2.8	1.6	0.3
0.75	6.4	3.3	5.1	1.8