import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


# Dataset reference: https://www.kaggle.com/datasets/ayessa/salary-prediction-classification
# The CSV is fetched over HTTP from the raw GitHub URL below, so this cell
# needs network access.
url = 'https://raw.githubusercontent.com/dev-EthanJ/scikit-learn_Machine_Learning/main/data/salary.csv'

# skipinitialspace=True tells the parser to skip spaces right after each
# delimiter, so string values are read without a leading blank.
df = pd.read_csv(url, skipinitialspace=True)

df.head()


df.tail()


df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             48842 non-null  int64 
 1   workclass       46043 non-null  object
 2   education       48842 non-null  object
 3   education-num   48842 non-null  int64 
 4   marital-status  48842 non-null  object
 5   occupation      46033 non-null  object
 6   relationship    48842 non-null  object
 7   race            48842 non-null  object
 8   sex             48842 non-null  object
 9   capital-gain    48842 non-null  int64 
 10  capital-loss    48842 non-null  int64 
 11  hours-per-week  48842 non-null  int64 
 12  native-country  47985 non-null  object
 13  class           48842 non-null  object
dtypes: int64(5), object(9)
memory usage: 5.2+ MB


df.describe()


df.describe(include=['O'])


df.describe(include='all')


df['class']

0        <=50K
1        <=50K
2         >50K
3         >50K
4        <=50K
         ...  
48837    <=50K
48838     >50K
48839    <=50K
48840    <=50K
48841     >50K
Name: class, Length: 48842, dtype: object


df['class'].value_counts()

<=50K    37155
>50K     11687
Name: class, dtype: int64


class_data = df['class'].unique()

class_data

array(['<=50K', '>50K'], dtype=object)


# Encode the target with an explicit label -> integer mapping.
# Building the mapping from df['class'].unique() positions would tie the
# encoding to whichever label happens to appear first in the file, so a
# re-sorted or re-sampled CSV could silently swap 0 and 1.
df['class'] = df['class'].map({'<=50K': 0, '>50K': 1})

# Sanity check: only 0/1 should remain and no label may have mapped to NaN.
print(df['class'].unique())
print(df['class'].isnull().sum())

[0 1]
0


df['sex'].unique()

array(['Male', 'Female'], dtype=object)


df['sex'].isnull().sum()

0


# Binary-encode sex as Male=1 / Female=0.
# This is the same encoding get_dummies(..., drop_first=True) produced, but
# written as an explicit comparison: it does not rely on assigning a
# one-column DataFrame to a column, and the uint8 dtype no longer depends on
# the installed pandas version (get_dummies defaults to bool from pandas 2.0).
df['sex'] = (df['sex'] == 'Male').astype('uint8')

df['sex'].head()

0    1
1    1
2    1
3    1
4    0
Name: sex, dtype: uint8


# (column name, number of distinct values) for every object-dtype column.
obj_list = [
    (name, df[name].nunique())
    for name, dtype in df.dtypes.items()
    if dtype == 'object'
]

obj_list

[('workclass', 8),
 ('education', 16),
 ('marital-status', 7),
 ('occupation', 14),
 ('relationship', 6),
 ('race', 5),
 ('native-country', 41)]


# Show only the categorical columns with more than 10 distinct values.
for name, n_unique in obj_list:
    if n_unique <= 10:
        continue
    print((name, n_unique))

('education', 16)
('occupation', 14)
('native-country', 41)


edu_sr = df['education'].value_counts()
edu_sr

HS-grad         15784
Some-college    10878
Bachelors        8025
Masters          2657
Assoc-voc        2061
11th             1812
Assoc-acdm       1601
10th             1389
7th-8th           955
Prof-school       834
9th               756
12th              657
Doctorate         594
5th-6th           509
1st-4th           247
Preschool          83
Name: education, dtype: int64


edunum_sr = df['education-num'].value_counts()
edunum_sr

9     15784
10    10878
13     8025
14     2657
11     2061
7      1812
12     1601
6      1389
4       955
15      834
5       756
8       657
16      594
3       509
2       247
1        83
Name: education-num, dtype: int64


# For each education-num code, list the education labels that occur with it,
# to check whether the two columns carry the same information.
edu_codes = df['education-num']
for num in edu_codes.unique():
    labels = df.loc[edu_codes == num, 'education'].unique()
    print(num, labels)

7 ['11th']
9 ['HS-grad']
12 ['Assoc-acdm']
10 ['Some-college']
6 ['10th']
15 ['Prof-school']
4 ['7th-8th']
13 ['Bachelors']
14 ['Masters']
16 ['Doctorate']
3 ['5th-6th']
11 ['Assoc-voc']
5 ['9th']
8 ['12th']
2 ['1st-4th']
1 ['Preschool']


# Each education-num code pairs with exactly one education label, so the
# text column is redundant and only the numeric one is kept.
df = df.drop(columns=['education'])

df.head()


df['occupation'].value_counts()

Prof-specialty       6172
Craft-repair         6112
Exec-managerial      6086
Adm-clerical         5611
Sales                5504
Other-service        4923
Machine-op-inspct    3022
Transport-moving     2355
Handlers-cleaners    2072
Farming-fishing      1490
Tech-support         1446
Protective-serv       983
Priv-house-serv       242
Armed-Forces           15
Name: occupation, dtype: int64


df['native-country'].value_counts()

United-States                 43832
Mexico                          951
Philippines                     295
Germany                         206
Puerto-Rico                     184
Canada                          182
El-Salvador                     155
India                           151
Cuba                            138
England                         127
China                           122
South                           115
Jamaica                         106
Italy                           105
Dominican-Republic              103
Japan                            92
Guatemala                        88
Poland                           87
Vietnam                          86
Columbia                         85
Haiti                            75
Portugal                         67
Taiwan                           65
Iran                             59
Greece                           49
Nicaragua                        49
Peru                             46
Ecuador                          45
France                           38
Ireland                          37
Hong                             30
Thailand                         30
Cambodia                         28
Trinadad&Tobago                  27
Yugoslavia                       23
Outlying-US(Guam-USVI-etc)       23
Laos                             23
Scotland                         21
Honduras                         20
Hungary                          19
Holand-Netherlands                1
Name: native-country, dtype: int64


# Look-up keyed on the 'native-country' column:
#   df.groupby('native-country')['<target column>'].<aggregate function>()
# class is encoded 0/1 at this point, so the mean is the share of >50K rows
# for each country.
df.groupby('native-country')['class'].mean().sort_values(ascending=False)

native-country
France                        0.421053
India                         0.410596
Taiwan                        0.400000
Iran                          0.372881
England                       0.370079
Greece                        0.367347
Yugoslavia                    0.347826
Japan                         0.347826
Canada                        0.346154
Italy                         0.323810
Cambodia                      0.321429
Hungary                       0.315789
Ireland                       0.297297
China                         0.295082
Philippines                   0.288136
Germany                       0.281553
Hong                          0.266667
Cuba                          0.246377
United-States                 0.243977
Poland                        0.195402
Portugal                      0.179104
South                         0.173913
Thailand                      0.166667
Scotland                      0.142857
Jamaica                       0.141509
Ecuador                       0.133333
Haiti                         0.120000
Puerto-Rico                   0.108696
Honduras                      0.100000
Peru                          0.086957
Laos                          0.086957
Vietnam                       0.081395
Trinadad&Tobago               0.074074
El-Salvador                   0.070968
Nicaragua                     0.061224
Mexico                        0.049422
Dominican-Republic            0.048544
Columbia                      0.047059
Outlying-US(Guam-USVI-etc)    0.043478
Guatemala                     0.034091
Holand-Netherlands            0.000000
Name: class, dtype: float64


df[df['native-country'] == 'France'].groupby('occupation')['class'].mean()

occupation
Adm-clerical       0.666667
Craft-repair       0.000000
Exec-managerial    0.555556
Farming-fishing    0.000000
Other-service      0.000000
Priv-house-serv    0.000000
Prof-specialty     0.272727
Protective-serv    1.000000
Sales              1.000000
Tech-support       0.750000
Name: class, dtype: float64


df[df['native-country'] == 'France'].groupby('occupation')['class'].count()

occupation
Adm-clerical        3
Craft-repair        1
Exec-managerial     9
Farming-fishing     1
Other-service       2
Priv-house-serv     2
Prof-specialty     11
Protective-serv     1
Sales               2
Tech-support        4
Name: class, dtype: int64


# Target-encode native-country: per-country mean of the 0/1 class column.
# NOTE(review): the mean is taken over the whole frame, before the
# train_test_split further down, so rows that later land in the test set
# contribute to this feature. Consider computing it on training rows only.
country_group = df.groupby('native-country')['class'].mean()

country_group

native-country
Cambodia                      0.321429
Canada                        0.346154
China                         0.295082
Columbia                      0.047059
Cuba                          0.246377
Dominican-Republic            0.048544
Ecuador                       0.133333
El-Salvador                   0.070968
England                       0.370079
France                        0.421053
Germany                       0.281553
Greece                        0.367347
Guatemala                     0.034091
Haiti                         0.120000
Holand-Netherlands            0.000000
Honduras                      0.100000
Hong                          0.266667
Hungary                       0.315789
India                         0.410596
Iran                          0.372881
Ireland                       0.297297
Italy                         0.323810
Jamaica                       0.141509
Japan                         0.347826
Laos                          0.086957
Mexico                        0.049422
Nicaragua                     0.061224
Outlying-US(Guam-USVI-etc)    0.043478
Peru                          0.086957
Philippines                   0.288136
Poland                        0.195402
Portugal                      0.179104
Puerto-Rico                   0.108696
Scotland                      0.142857
South                         0.173913
Taiwan                        0.400000
Thailand                      0.166667
Trinadad&Tobago               0.074074
United-States                 0.243977
Vietnam                       0.081395
Yugoslavia                    0.347826
Name: class, dtype: float64


country_group.index

Index(['Cambodia', 'Canada', 'China', 'Columbia', 'Cuba', 'Dominican-Republic',
       'Ecuador', 'El-Salvador', 'England', 'France', 'Germany', 'Greece',
       'Guatemala', 'Haiti', 'Holand-Netherlands', 'Honduras', 'Hong',
       'Hungary', 'India', 'Iran', 'Ireland', 'Italy', 'Jamaica', 'Japan',
       'Laos', 'Mexico', 'Nicaragua', 'Outlying-US(Guam-USVI-etc)', 'Peru',
       'Philippines', 'Poland', 'Portugal', 'Puerto-Rico', 'Scotland', 'South',
       'Taiwan', 'Thailand', 'Trinadad&Tobago', 'United-States', 'Vietnam',
       'Yugoslavia'],
      dtype='object', name='native-country')


# Turn the per-country Series into a two-column frame
# (native-country, class) so it can be merged on the country name.
country_group = country_group.reset_index()

country_group


# Left merge keeps every row of df. Both frames have a 'class' column, so
# the merged frame carries them as class_x (original target) and class_y
# (per-country mean). Rows with a missing native-country get NaN in class_y.
df = df.merge(country_group, on='native-country', how='left')


df.head()


# The country name itself is no longer needed once its mean is attached.
df.drop('native-country', axis=1, inplace=True)

df.head()


# Restore the target's name and let the per-country mean take over the
# 'native-country' name as a numeric feature.
df = df.rename(columns={'class_x':'class', 'class_y':'native-country'})

df.head()


df.isnull().mean()

age               0.000000
workclass         0.057307
education-num     0.000000
marital-status    0.000000
occupation        0.057512
relationship      0.000000
race              0.000000
sex               0.000000
capital-gain      0.000000
capital-loss      0.000000
hours-per-week    0.000000
class             0.000000
native-country    0.017546
dtype: float64


# Fill the remaining missing native-country values with 0, an arbitrary
# placeholder.
# The result is assigned back rather than using fillna(inplace=True) on the
# column selection: that form is chained assignment, which newer pandas
# warns about and which does not update df at all under Copy-on-Write.
df['native-country'] = df['native-country'].fillna(0)

df.isna().mean()

age               0.000000
workclass         0.057307
education-num     0.000000
marital-status    0.000000
occupation        0.057512
relationship      0.000000
race              0.000000
sex               0.000000
capital-gain      0.000000
capital-loss      0.000000
hours-per-week    0.000000
class             0.000000
native-country    0.000000
dtype: float64


df['workclass'].value_counts() / len(df)

Private             0.694198
Self-emp-not-inc    0.079071
Local-gov           0.064207
State-gov           0.040559
Self-emp-inc        0.034704
Federal-gov         0.029319
Without-pay         0.000430
Never-worked        0.000205
Name: workclass, dtype: float64


# Impute missing workclass with 'Private', the most frequent category.
# The result is assigned back rather than using fillna(inplace=True) on the
# column selection: that form is chained assignment, which newer pandas
# warns about and which does not update df at all under Copy-on-Write.
df['workclass'] = df['workclass'].fillna('Private')

df['workclass'].isnull().sum()

0


df.isna().mean()

age               0.000000
workclass         0.000000
education-num     0.000000
marital-status    0.000000
occupation        0.057512
relationship      0.000000
race              0.000000
sex               0.000000
capital-gain      0.000000
capital-loss      0.000000
hours-per-week    0.000000
class             0.000000
native-country    0.000000
dtype: float64


df['occupation'].value_counts()

Prof-specialty       6172
Craft-repair         6112
Exec-managerial      6086
Adm-clerical         5611
Sales                5504
Other-service        4923
Machine-op-inspct    3022
Transport-moving     2355
Handlers-cleaners    2072
Farming-fishing      1490
Tech-support         1446
Protective-serv       983
Priv-house-serv       242
Armed-Forces           15
Name: occupation, dtype: int64


# One-hot encode every remaining object column; drop_first=True omits the
# first level of each column so the dummies are not redundant.
# NOTE(review): occupation still has missing values at this point and no
# NaN indicator column is requested, so those rows presumably end up with
# every occupation_* column at 0, the same encoding as the dropped first
# level. Confirm whether missing occupations should get their own category.
df = pd.get_dummies(df, drop_first=True)

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48842 entries, 0 to 48841
Data columns (total 43 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   age                                   48842 non-null  int64  
 1   education-num                         48842 non-null  int64  
 2   sex                                   48842 non-null  uint8  
 3   capital-gain                          48842 non-null  int64  
 4   capital-loss                          48842 non-null  int64  
 5   hours-per-week                        48842 non-null  int64  
 6   class                                 48842 non-null  int64  
 7   native-country                        48842 non-null  float64
 8   workclass_Local-gov                   48842 non-null  uint8  
 9   workclass_Never-worked                48842 non-null  uint8  
 10  workclass_Private                     48842 non-null  uint8  
 11  workclass_Self-emp-inc                48842 non-null  uint8  
 12  workclass_Self-emp-not-inc            48842 non-null  uint8  
 13  workclass_State-gov                   48842 non-null  uint8  
 14  workclass_Without-pay                 48842 non-null  uint8  
 15  marital-status_Married-AF-spouse      48842 non-null  uint8  
 16  marital-status_Married-civ-spouse     48842 non-null  uint8  
 17  marital-status_Married-spouse-absent  48842 non-null  uint8  
 18  marital-status_Never-married          48842 non-null  uint8  
 19  marital-status_Separated              48842 non-null  uint8  
 20  marital-status_Widowed                48842 non-null  uint8  
 21  occupation_Armed-Forces               48842 non-null  uint8  
 22  occupation_Craft-repair               48842 non-null  uint8  
 23  occupation_Exec-managerial            48842 non-null  uint8  
 24  occupation_Farming-fishing            48842 non-null  uint8  
 25  occupation_Handlers-cleaners          48842 non-null  uint8  
 26  occupation_Machine-op-inspct          48842 non-null  uint8  
 27  occupation_Other-service              48842 non-null  uint8  
 28  occupation_Priv-house-serv            48842 non-null  uint8  
 29  occupation_Prof-specialty             48842 non-null  uint8  
 30  occupation_Protective-serv            48842 non-null  uint8  
 31  occupation_Sales                      48842 non-null  uint8  
 32  occupation_Tech-support               48842 non-null  uint8  
 33  occupation_Transport-moving           48842 non-null  uint8  
 34  relationship_Not-in-family            48842 non-null  uint8  
 35  relationship_Other-relative           48842 non-null  uint8  
 36  relationship_Own-child                48842 non-null  uint8  
 37  relationship_Unmarried                48842 non-null  uint8  
 38  relationship_Wife                     48842 non-null  uint8  
 39  race_Asian-Pac-Islander               48842 non-null  uint8  
 40  race_Black                            48842 non-null  uint8  
 41  race_Other                            48842 non-null  uint8  
 42  race_White                            48842 non-null  uint8  
dtypes: float64(1), int64(6), uint8(36)
memory usage: 4.7 MB


from sklearn.model_selection import train_test_split

# Separate the binary target from the feature columns.
y = df['class']
X = df.drop(columns='class')

# Hold out 30% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=814,
)


from sklearn.tree import DecisionTreeClassifier

# Baseline: a decision tree with no depth limit, scored on the held-out rows.
model = DecisionTreeClassifier(random_state=814).fit(X_train, y_train)
pred = model.predict(X_test)

pred

array([0, 0, 0, ..., 1, 0, 0])


from sklearn.metrics import accuracy_score

accuracy_score(y_test, pred)

0.8173752815123183


# Re-split and compare train vs. test accuracy to expose overfitting.
# NOTE(review): test_size=0.8 leaves only 20% of the rows for training,
# whereas the earlier split used test_size=0.3. Confirm this is intended
# and not a slip for 0.2; every score printed below depends on it.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.8, random_state=814
)

# NOTE(review): random_state=100 here differs from the 814 used for the
# other trees in this notebook.
model = DecisionTreeClassifier(random_state=100)
model.fit(X_train, y_train)

train_pred = model.predict(X_train)
test_pred = model.predict(X_test)

print(f'Train score: {accuracy_score(y_train, train_pred)}')
print(f'Test score: {accuracy_score(y_test, test_pred)}')

Train score: 0.9880221130221131
Test score: 0.8129702615549982


# Cap the tree at depth 5 and compare train vs. test accuracy again.
model = DecisionTreeClassifier(random_state=814, max_depth=5).fit(X_train, y_train)

train_pred, test_pred = (model.predict(part) for part in (X_train, X_test))

for label, truth, guess in (
    ('Train score:', y_train, train_pred),
    ('Test score:', y_test, test_pred),
):
    print(label, accuracy_score(truth, guess))

Train score: 0.8536036036036037
Test score: 0.8465219839279317


def tuning_depth(depth: int):
    """Fit a depth-limited decision tree and report train/test accuracy.

    Reads the module-level ``X_train``, ``X_test``, ``y_train`` and
    ``y_test`` splits, so those must exist before this is called. The
    fitted tree is local to the call and does not replace the global
    ``model``.

    Args:
        depth: Value passed as ``max_depth`` to ``DecisionTreeClassifier``.

    Returns:
        A ``(train_score, test_score)`` tuple of accuracies, so callers can
        collect or compare depths programmatically instead of reading the
        printed output.
    """
    model = DecisionTreeClassifier(max_depth=depth, random_state=814)
    model.fit(X_train, y_train)

    train_score = accuracy_score(y_train, model.predict(X_train))
    test_score = accuracy_score(y_test, model.predict(X_test))

    print('depth=', depth)
    print('Train score:', train_score)
    print('Test score:', test_score)

    return train_score, test_score


# Sweep max_depth from 1 to 20 to see where train and test accuracy diverge.
for i in range(1, 21):
    tuning_depth(i)

depth= 1
Train score: 0.7619778869778869
Test score: 0.7604033372575114
depth= 2
Train score: 0.8293407043407044
Test score: 0.8289655525413319
depth= 3
Train score: 0.8455159705159705
Test score: 0.8438091825766494
depth= 4
Train score: 0.8471539721539721
Test score: 0.8434252955929774
depth= 5
Train score: 0.8536036036036037
Test score: 0.8465219839279317
depth= 6
Train score: 0.8610769860769861
Test score: 0.8517428469058709
depth= 7
Train score: 0.8645577395577395
Test score: 0.8512565900598864
depth= 8
Train score: 0.871007371007371
Test score: 0.8473153503608538
depth= 9
Train score: 0.8769451269451269
Test score: 0.8480575318626197
depth= 10
Train score: 0.8833947583947583
Test score: 0.8500537441777141
depth= 11
Train score: 0.8908681408681409
Test score: 0.8482110866560885
depth= 12
Train score: 0.8990581490581491
Test score: 0.8442698469570559
depth= 13
Train score: 0.9057125307125307
Test score: 0.8409172339663203
depth= 14
Train score: 0.9129811629811629
Test score: 0.8405845319138046
depth= 15
Train score: 0.9206592956592956
Test score: 0.8295029943184726
depth= 16
Train score: 0.9297706797706797
Test score: 0.8263295285867841
depth= 17
Train score: 0.937039312039312
Test score: 0.8249475354455649
depth= 18
Train score: 0.944000819000819
Test score: 0.8246148333930491
depth= 19
Train score: 0.949017199017199
Test score: 0.8229257306648923
depth= 20
Train score: 0.9543407043407044
Test score: 0.8177560526181092


from sklearn.tree import plot_tree

# Set the figure size
plt.figure(figsize=(30, 15))
# No max_depth is passed to plot_tree here, so the drawing is not truncated.
plot_tree(model)
plt.show()


# Draw only the top three levels, with a larger font so the nodes are legible.
plt.figure(figsize=(30, 15))
plot_tree(model, max_depth=3, fontsize=15)

plt.show()


# Same view, with nodes labelled by column name instead of feature index.
# The names are passed as a plain list: some scikit-learn releases validate
# feature_names strictly and reject a pandas Index.
plt.figure(figsize=(30, 15))
plot_tree(model, max_depth=3, fontsize=15, feature_names=list(X_train.columns))

plt.show()


!pip install mlxtend --quiet


import joblib

# Train the tree that gets persisted to disk.
# random_state is pinned to the seed used for the other trees in this
# notebook: without it the feature order tried at each split is random, so
# re-running the cell could save a different model.
model = DecisionTreeClassifier(max_depth=8, random_state=814)
model.fit(X_train, y_train)

joblib.dump(model, 'DecisionTree_model.pkl')

['DecisionTree_model.pkl']

	age	education-num	capital-gain	capital-loss	hours-per-week
count	48842.000000	48842.000000	48842.000000	48842.000000	48842.000000
mean	38.643585	10.078089	1079.067626	87.502314	40.422382
std	13.710510	2.570973	7452.019058	403.004552	12.391444
min	17.000000	1.000000	0.000000	0.000000	1.000000
25%	28.000000	9.000000	0.000000	0.000000	40.000000
50%	37.000000	10.000000	0.000000	0.000000	40.000000
75%	48.000000	12.000000	0.000000	0.000000	45.000000
max	90.000000	16.000000	99999.000000	4356.000000	99.000000

	age	workclass	education	education-num	marital-status	occupation	relationship	race	sex	capital-gain	capital-loss	hours-per-week	native-country	class
count	48842.000000	46043	48842	48842.000000	48842	46033	48842	48842	48842	48842.000000	48842.000000	48842.000000	47985	48842
unique	NaN	8	16	NaN	7	14	6	5	2	NaN	NaN	NaN	41	2
top	NaN	Private	HS-grad	NaN	Married-civ-spouse	Prof-specialty	Husband	White	Male	NaN	NaN	NaN	United-States	<=50K
freq	NaN	33906	15784	NaN	22379	6172	19716	41762	32650	NaN	NaN	NaN	43832	37155
mean	38.643585	NaN	NaN	10.078089	NaN	NaN	NaN	NaN	NaN	1079.067626	87.502314	40.422382	NaN	NaN
std	13.710510	NaN	NaN	2.570973	NaN	NaN	NaN	NaN	NaN	7452.019058	403.004552	12.391444	NaN	NaN
min	17.000000	NaN	NaN	1.000000	NaN	NaN	NaN	NaN	NaN	0.000000	0.000000	1.000000	NaN	NaN
25%	28.000000	NaN	NaN	9.000000	NaN	NaN	NaN	NaN	NaN	0.000000	0.000000	40.000000	NaN	NaN
50%	37.000000	NaN	NaN	10.000000	NaN	NaN	NaN	NaN	NaN	0.000000	0.000000	40.000000	NaN	NaN
75%	48.000000	NaN	NaN	12.000000	NaN	NaN	NaN	NaN	NaN	0.000000	0.000000	45.000000	NaN	NaN
max	90.000000	NaN	NaN	16.000000	NaN	NaN	NaN	NaN	NaN	99999.000000	4356.000000	99.000000	NaN	NaN

scikit-learn Machine Learning XGBoost 사이킷런 머신러닝 XGBoost (0)	2022.11.21
scikit-learn Machine Learning RandomForest 사이킷런 머신러닝 랜덤포레스트 (0)	2022.11.21
scikit-learn Machine Learning Naive Bayes Kor NLP 사이킷런 머신러닝 나이브베이즈 한글 자연어처리 (0)	2022.11.21
scikit-learn Machine Learning Naive Bayes Eng NLP 사이킷런 머신러닝 나이브베이즈 영어 자연어처리 (0)	2022.11.15
scikit-learn Machine Learning k-NN algorithm 사이킷런 머신러닝 k-NN 알고리즘 (0)	2022.11.15

Try to 개발자 EthanJ의 성장 로그

Try to 개발자 EthanJ의 성장 로그

scikit-learn Machine Learning DecisionTree 사이킷런 머신러닝 결정트리 본문

scikit-learn Machine Learning DecisionTree 사이킷런 머신러닝 결정트리

scikit-learn Machine Learning DecisionTree
사이킷런 머신러닝 결정트리

Data Collection¶

Data pre-processing¶

범주형 변수 전처리¶

`class`¶

`sex`¶

`education`, `education-num`¶

`occupation`¶

`native-country`¶

결측치 처리¶

`native-country`: 임의의 값 넣기¶

`workclass`¶

`occupation`¶

더미변수화¶

Training Model¶

Evaluating Model¶

Hyperparameter tuning¶

tree depth가 깊어지면 overfitting이 발생한다¶

트리 그래프¶

'CS & DS > scikit-learn Machine Learning' 카테고리의 다른 글

티스토리툴바

	age	workclass	education	education-num	marital-status	occupation	relationship	race	sex	capital-gain	hours-per-week	native-country	class
0	25	Private	11th	7	Never-married	Machine-op-inspct	Own-child	Black	Male	0	40	United-States	<=50K
1	38	Private	HS-grad	9	Married-civ-spouse	Farming-fishing	Husband	White	Male	0	50	United-States	<=50K
2	28	Local-gov	Assoc-acdm	12	Married-civ-spouse	Protective-serv	Husband	White	Male	0	40	United-States	>50K
3	44	Private	Some-college	10	Married-civ-spouse	Machine-op-inspct	Husband	Black	Male	7688	40	United-States	>50K
4	18	NaN	Some-college	10	Never-married	NaN	Own-child	White	Female	0	30	United-States	<=50K

	age	workclass	education	education-num	marital-status	occupation	relationship	race	sex	capital-gain	hours-per-week	native-country	class
48837	27	Private	Assoc-acdm	12	Married-civ-spouse	Tech-support	Wife	White	Female	0	38	United-States	<=50K
48838	40	Private	HS-grad	9	Married-civ-spouse	Machine-op-inspct	Husband	White	Male	0	40	United-States	>50K
48839	58	Private	HS-grad	9	Widowed	Adm-clerical	Unmarried	White	Female	0	40	United-States	<=50K
48840	22	Private	HS-grad	9	Never-married	Adm-clerical	Own-child	White	Male	0	20	United-States	<=50K
48841	52	Self-emp-inc	HS-grad	9	Married-civ-spouse	Exec-managerial	Wife	White	Female	15024	40	United-States	>50K

	native-country	class
0	Cambodia	0.321429
1	Canada	0.346154
2	China	0.295082
3	Columbia	0.047059
4	Cuba	0.246377
5	Dominican-Republic	0.048544
6	Ecuador	0.133333
7	El-Salvador	0.070968
8	England	0.370079
9	France	0.421053
10	Germany	0.281553
11	Greece	0.367347
12	Guatemala	0.034091
13	Haiti	0.120000
14	Holand-Netherlands	0.000000
15	Honduras	0.100000
16	Hong	0.266667
17	Hungary	0.315789
18	India	0.410596
19	Iran	0.372881
20	Ireland	0.297297
21	Italy	0.323810
22	Jamaica	0.141509
23	Japan	0.347826
24	Laos	0.086957
25	Mexico	0.049422
26	Nicaragua	0.061224
27	Outlying-US(Guam-USVI-etc)	0.043478
28	Peru	0.086957
29	Philippines	0.288136
30	Poland	0.195402
31	Portugal	0.179104
32	Puerto-Rico	0.108696
33	Scotland	0.142857
34	South	0.173913
35	Taiwan	0.400000
36	Thailand	0.166667
37	Trinadad&Tobago	0.074074
38	United-States	0.243977
39	Vietnam	0.081395
40	Yugoslavia	0.347826

« 2025/07 »
일	월	화	수	목	금	토
		1	2	3	4	5
6	7	8	9	10	11	12
13	14	15	16	17	18	19
20	21	22	23	24	25	26
27	28	29	30	31

Try to 개발자 EthanJ의 성장 로그

scikit-learn Machine Learning DecisionTree 사이킷런 머신러닝 결정트리 본문

scikit-learn Machine Learning DecisionTree 사이킷런 머신러닝 결정트리

scikit-learn Machine Learning DecisionTree 사이킷런 머신러닝 결정트리

Data Collection¶

Data pre-processing¶

범주형 변수 전처리¶

class¶

sex¶

education, education-num¶

occupation¶

native-country¶

결측치 처리¶

native-country: 임의의 값 넣기¶

workclass¶

occupation¶

더미변수화¶

Training Model¶

Evaluating Model¶

Hyperparameter tuning¶

tree depth가 깊어지면 overfitting이 발생한다¶

트리 그래프¶

'CS & DS > scikit-learn Machine Learning' 카테고리의 다른 글

티스토리툴바

scikit-learn Machine Learning DecisionTree
사이킷런 머신러닝 결정트리

`class`¶

`sex`¶

`education`, `education-num`¶

`occupation`¶

`native-country`¶

`native-country`: 임의의 값 넣기¶

`workclass`¶

`occupation`¶