import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# 'https://www.kaggle.com/datasets/nehalbirla/vehicle-dataset-from-cardekho'

# The CSV location is kept as a base URL plus a file name and joined just before reading.
file_url = 'https://raw.githubusercontent.com/dev-EthanJ/scikit-learn_Machine_Learning/main/data/'
file_name = 'car.csv'
df = pd.read_csv(f'{file_url}{file_name}')

df.head()


df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8128 entries, 0 to 8127
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           8128 non-null   object 
 1   year           8128 non-null   int64  
 2   selling_price  8128 non-null   int64  
 3   km_driven      8128 non-null   int64  
 4   fuel           8128 non-null   object 
 5   seller_type    8128 non-null   object 
 6   transmission   8128 non-null   object 
 7   owner          8128 non-null   object 
 8   mileage        7907 non-null   object 
 9   engine         7907 non-null   object 
 10  max_power      7913 non-null   object 
 11  torque         7906 non-null   object 
 12  seats          7907 non-null   float64
dtypes: float64(1), int64(3), object(9)
memory usage: 825.6+ KB


missing_value = df.isnull().sum()

missing_value

name               0
year               0
selling_price      0
km_driven          0
fuel               0
seller_type        0
transmission       0
owner              0
mileage          221
engine           221
max_power        215
torque           222
seats            221
dtype: int64


# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.plot.html
# Keep only the columns that have at least one missing value and draw them
# as horizontal bars on a fixed 0-250 x-axis.
ax = missing_value.loc[missing_value > 0].plot(kind='barh', xlim=(0, 250))

# Write each bar's length as text, anchored at the bar's end and lower edge.
for bar in ax.patches:
    bar_length = bar.get_width()
    ax.annotate(str(bar_length), (bar_length, bar.get_y()))

plt.show()


# Share of missing values per column, as a percentage (mean of the null mask * 100).
missing_value = df.isnull().mean() * 100

# Plot only the columns that actually have missing values, on a 0-100 % axis.
ax = missing_value[missing_value.gt(0)].plot(xlim=(0, 100), kind='barh')

# Label each bar with its percentage rounded to two decimals; printing the raw
# float would put a long, unrounded number next to every bar.
# The label is nudged slightly right of the bar end and up from its lower edge.
for p in ax.patches:
    ax.annotate(f'{p.get_width():.2f}%', (p.get_width() * 1.05, p.get_y() + 0.15))

plt.show()


df.isna().mean()

name             0.000000
year             0.000000
selling_price    0.000000
km_driven        0.000000
fuel             0.000000
seller_type      0.000000
transmission     0.000000
owner            0.000000
mileage          0.027190
engine           0.027190
max_power        0.026452
torque           0.027313
seats            0.027190
dtype: float64


df = df.dropna()
len(df)

7906


pd.options.display.float_format = '{:,.2f}'.format

df.describe()


df.boxplot()

<matplotlib.axes._subplots.AxesSubplot at 0x7f337d7d7290>


# subplots
# One figure with two side-by-side axes, one box plot per column, so each
# column gets its own y-scale.
fig, (axis1, axis2) = plt.subplots(nrows=1, ncols=2)

df['selling_price'].plot(kind='box', ax=axis1)
df['km_driven'].plot(kind='box', ax=axis2)

plt.show()


df.head()


df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7906 entries, 0 to 8127
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           7906 non-null   object 
 1   year           7906 non-null   int64  
 2   selling_price  7906 non-null   int64  
 3   km_driven      7906 non-null   int64  
 4   fuel           7906 non-null   object 
 5   seller_type    7906 non-null   object 
 6   transmission   7906 non-null   object 
 7   owner          7906 non-null   object 
 8   mileage        7906 non-null   object 
 9   engine         7906 non-null   object 
 10  max_power      7906 non-null   object 
 11  torque         7906 non-null   object 
 12  seats          7906 non-null   float64
dtypes: float64(1), int64(3), object(9)
memory usage: 864.7+ KB


df[['mileage', 'engine', 'max_power', 'torque']].head(10)


df.describe(include=['O'])


df['engine'].str.split().head()

0    [1248, CC]
1    [1498, CC]
2    [1497, CC]
3    [1396, CC]
4    [1298, CC]
Name: engine, dtype: object


df['engine'].str.split(expand=True).head()


# Split strings such as "1248 CC" on whitespace into two columns: the first part
# overwrites `engine` (still a string at this point), the second is stored in the
# new `engine_unit` column. The assignment requires the split to yield exactly
# two columns.
df[['engine', 'engine_unit']] = df.engine.str.split(expand=True)

df[['engine', 'engine_unit']].head()


df.engine.head()

0    1248
1    1498
2    1497
3    1396
4    1298
Name: engine, dtype: object


df['engine'] = df['engine'].astype('float32')

df['engine'].head()

0   1,248.00
1   1,498.00
2   1,497.00
3   1,396.00
4   1,298.00
Name: engine, dtype: float32


df['engine_unit'].unique()

array(['CC'], dtype=object)


df = df.drop(columns=['engine_unit'])

df.head()


df['max_power'].head()

0        74 bhp
1    103.52 bhp
2        78 bhp
3        90 bhp
4      88.2 bhp
Name: max_power, dtype: object


# Split strings such as "103.52 bhp" on whitespace: the numeric text stays in
# `max_power` (still a string here), the unit goes to the new `max_power_unit` column.
df[['max_power', 'max_power_unit']] = df['max_power'].str.split(expand=True)

df['max_power'].head()

0        74
1    103.52
2        78
3        90
4      88.2
Name: max_power, dtype: object


df['max_power'] = df['max_power'].astype('float')

df['max_power'].head()

0    74.00
1   103.52
2    78.00
3    90.00
4    88.20
Name: max_power, dtype: float64


df['max_power_unit'].unique()

array(['bhp'], dtype=object)


df = df.drop(columns=['max_power_unit'])

df.head()


df['mileage'].head()

0     23.4 kmpl
1    21.14 kmpl
2     17.7 kmpl
3     23.0 kmpl
4     16.1 kmpl
Name: mileage, dtype: object


# Split strings such as "23.4 kmpl" on whitespace: the numeric text stays in
# `mileage` (still a string here), the unit goes to the new `mileage_unit` column.
df[['mileage', 'mileage_unit']] = df['mileage'].str.split(expand=True)

df['mileage'].head()

0     23.4
1    21.14
2     17.7
3     23.0
4     16.1
Name: mileage, dtype: object


df['mileage'] = df['mileage'].astype('float')

df['mileage'].head()

0   23.40
1   21.14
2   17.70
3   23.00
4   16.10
Name: mileage, dtype: float64


df['mileage_unit'].unique()

array(['kmpl', 'km/kg'], dtype=object)


df['fuel'].unique()

array(['Diesel', 'Petrol', 'LPG', 'CNG'], dtype=object)


# Divisor applied to `mileage`, looked up by the row's `fuel` value.
# NOTE(review): presumably a unit price per fuel type; the currency and the
# quantity unit are not stated in this file — confirm.
fuels = dict(
    Petrol=80.43,
    Diesel=73.56,
    LPG=40.85,
    CNG=44.23,
)


def manipulate_mileage(row):
    """Return the row's mileage divided by the `fuels` entry for its fuel type.

    `row` is any mapping-like object indexable by 'mileage' and 'fuel'
    (used with `DataFrame.apply(..., axis=1)` in this file).
    Raises KeyError when the fuel type is not a key of `fuels`.
    """
    distance = row['mileage']
    divisor = fuels[row['fuel']]
    return distance / divisor


df['mileage'] = df.apply(manipulate_mileage, axis=1)

df['mileage'].head()

0   0.32
1   0.29
2   0.22
3   0.31
4   0.20
Name: mileage, dtype: float64


df = df.drop('mileage_unit', axis=1)

df.columns

Index(['name', 'year', 'selling_price', 'km_driven', 'fuel', 'seller_type',
       'transmission', 'owner', 'mileage', 'engine', 'max_power', 'torque',
       'seats'],
      dtype='object')


df['torque'].head(10)

0              190Nm@ 2000rpm
1         250Nm@ 1500-2500rpm
2       12.7@ 2,700(kgm@ rpm)
3    22.4 kgm at 1750-2750rpm
4       11.5@ 4,500(kgm@ rpm)
5           113.75nm@ 4000rpm
6        7.8@ 4,500(kgm@ rpm)
7               59Nm@ 2500rpm
8         170Nm@ 1800-2400rpm
9              160Nm@ 2000rpm
Name: torque, dtype: object


df['torque'] = df['torque'].str.upper()

df['torque'].head(10)

0              190NM@ 2000RPM
1         250NM@ 1500-2500RPM
2       12.7@ 2,700(KGM@ RPM)
3    22.4 KGM AT 1750-2750RPM
4       11.5@ 4,500(KGM@ RPM)
5           113.75NM@ 4000RPM
6        7.8@ 4,500(KGM@ RPM)
7               59NM@ 2500RPM
8         170NM@ 1800-2400RPM
9              160NM@ 2000RPM
Name: torque, dtype: object


def change_torque_unit(tq):
    """Classify a torque string by the unit marker it contains.

    The value is converted with `str()` and searched for upper-case markers:
    'NM' is checked first and yields 'Nm', then 'KGM' yields 'kgm'.
    Returns None when neither marker is present. The match is case-sensitive,
    so the caller is expected to upper-case the text beforehand.
    """
    text = str(tq)
    # Order matters: a string containing both markers is reported as 'Nm'.
    for marker, unit in (('NM', 'Nm'), ('KGM', 'kgm')):
        if marker in text:
            return unit
    return None


df['torque_unit'] = df['torque'].apply(change_torque_unit)

df['torque_unit'].unique()

array(['Nm', 'kgm', None], dtype=object)


df[df['torque_unit'].isna()]['torque'].unique()

array(['250@ 1250-5000RPM', '510@ 1600-2400', '110(11.2)@ 4800',
       '210 / 1900'], dtype=object)


df['torque_unit'] = df['torque_unit'].fillna('Nm')

df['torque_unit'].isna().sum()

0


# Keep only the first run of digits/dots in each torque string (the leading
# magnitude, e.g. "190" from "190NM@ 2000RPM"); the result is still text.
# The pattern is a raw string: in a normal literal "\." is an invalid escape
# sequence and newer Python versions warn about it. The regex itself is unchanged.
df['torque'] = df['torque'].str.extract(r'([0-9\.]+)')

df['torque'].head(10)

0       190
1       250
2      12.7
3      22.4
4      11.5
5    113.75
6       7.8
7        59
8       170
9       160
Name: torque, dtype: object


df['torque'] = df['torque'].astype('float')

df['torque'].head()

0   190.00
1   250.00
2    12.70
3    22.40
4    11.50
Name: torque, dtype: float64


def trans_torque(row):
    """Return the row's torque, scaled by 9.8066 when its unit is 'kgm'.

    `row` is any mapping-like object indexable by 'torque' and 'torque_unit'.
    Rows with any other unit value are returned unchanged, so after this step
    all values are meant to share one unit (presumably Nm — the other unit
    label used in this file).
    """
    if row['torque_unit'] == 'kgm':
        return row['torque'] * 9.8066
    return row['torque']


df['torque'] = df.apply(trans_torque, axis=1)

df['torque'].head()

0   190.00
1   250.00
2   124.54
3   219.67
4   112.78
Name: torque, dtype: float64


df = df.drop(columns=['torque_unit'])

df.head()


df['name'].nunique()

1982


df['name'].head(10)

0                  Maruti Swift Dzire VDI
1            Skoda Rapid 1.5 TDI Ambition
2                Honda City 2017-2020 EXi
3               Hyundai i20 Sportz Diesel
4                  Maruti Swift VXI BSIII
5           Hyundai Xcent 1.2 VTVT E Plus
6            Maruti Wagon R LXI DUO BSIII
7                      Maruti 800 DX BSII
8                        Toyota Etios VXD
9    Ford Figo Diesel Celebration Edition
Name: name, dtype: object


# Reduce the full model name to its first whitespace-separated token, which
# collapses the many distinct model names into a small set of values.
# NOTE(review): multi-word makes are cut to their first word (the unique values
# printed below include 'Land' and 'Ashok') — confirm that is acceptable.
df['name'] = df['name'].str.split(expand=True)[0]

df['name'].nunique()

31


np.sort(df['name'].unique())

array(['Ambassador', 'Ashok', 'Audi', 'BMW', 'Chevrolet', 'Daewoo',
       'Datsun', 'Fiat', 'Force', 'Ford', 'Honda', 'Hyundai', 'Isuzu',
       'Jaguar', 'Jeep', 'Kia', 'Land', 'Lexus', 'MG', 'Mahindra',
       'Maruti', 'Mercedes-Benz', 'Mitsubishi', 'Nissan', 'Opel',
       'Renault', 'Skoda', 'Tata', 'Toyota', 'Volkswagen', 'Volvo'],
      dtype=object)


df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7906 entries, 0 to 8127
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   name           7906 non-null   object 
 1   year           7906 non-null   int64  
 2   selling_price  7906 non-null   int64  
 3   km_driven      7906 non-null   int64  
 4   fuel           7906 non-null   object 
 5   seller_type    7906 non-null   object 
 6   transmission   7906 non-null   object 
 7   owner          7906 non-null   object 
 8   mileage        7906 non-null   float64
 9   engine         7906 non-null   float32
 10  max_power      7906 non-null   float64
 11  torque         7906 non-null   float64
 12  seats          7906 non-null   float64
dtypes: float32(1), float64(4), int64(3), object(5)
memory usage: 833.8+ KB


[(label, df[label].nunique()) for label in df.columns if df[label].dtype == 'object']

[('name', 31),
 ('fuel', 4),
 ('seller_type', 3),
 ('transmission', 2),
 ('owner', 5)]


df = pd.get_dummies(df, columns=['name', 'fuel', 'seller_type', 'transmission', 'owner'],
                    drop_first=True)

df.head()


df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7906 entries, 0 to 8127
Data columns (total 48 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   year                          7906 non-null   int64  
 1   selling_price                 7906 non-null   int64  
 2   km_driven                     7906 non-null   int64  
 3   mileage                       7906 non-null   float64
 4   engine                        7906 non-null   float32
 5   max_power                     7906 non-null   float64
 6   torque                        7906 non-null   float64
 7   seats                         7906 non-null   float64
 8   name_Ashok                    7906 non-null   uint8  
 9   name_Audi                     7906 non-null   uint8  
 10  name_BMW                      7906 non-null   uint8  
 11  name_Chevrolet                7906 non-null   uint8  
 12  name_Daewoo                   7906 non-null   uint8  
 13  name_Datsun                   7906 non-null   uint8  
 14  name_Fiat                     7906 non-null   uint8  
 15  name_Force                    7906 non-null   uint8  
 16  name_Ford                     7906 non-null   uint8  
 17  name_Honda                    7906 non-null   uint8  
 18  name_Hyundai                  7906 non-null   uint8  
 19  name_Isuzu                    7906 non-null   uint8  
 20  name_Jaguar                   7906 non-null   uint8  
 21  name_Jeep                     7906 non-null   uint8  
 22  name_Kia                      7906 non-null   uint8  
 23  name_Land                     7906 non-null   uint8  
 24  name_Lexus                    7906 non-null   uint8  
 25  name_MG                       7906 non-null   uint8  
 26  name_Mahindra                 7906 non-null   uint8  
 27  name_Maruti                   7906 non-null   uint8  
 28  name_Mercedes-Benz            7906 non-null   uint8  
 29  name_Mitsubishi               7906 non-null   uint8  
 30  name_Nissan                   7906 non-null   uint8  
 31  name_Opel                     7906 non-null   uint8  
 32  name_Renault                  7906 non-null   uint8  
 33  name_Skoda                    7906 non-null   uint8  
 34  name_Tata                     7906 non-null   uint8  
 35  name_Toyota                   7906 non-null   uint8  
 36  name_Volkswagen               7906 non-null   uint8  
 37  name_Volvo                    7906 non-null   uint8  
 38  fuel_Diesel                   7906 non-null   uint8  
 39  fuel_LPG                      7906 non-null   uint8  
 40  fuel_Petrol                   7906 non-null   uint8  
 41  seller_type_Individual        7906 non-null   uint8  
 42  seller_type_Trustmark Dealer  7906 non-null   uint8  
 43  transmission_Manual           7906 non-null   uint8  
 44  owner_Fourth & Above Owner    7906 non-null   uint8  
 45  owner_Second Owner            7906 non-null   uint8  
 46  owner_Test Drive Car          7906 non-null   uint8  
 47  owner_Third Owner             7906 non-null   uint8  
dtypes: float32(1), float64(4), int64(3), uint8(40)
memory usage: 833.8 KB


from sklearn.model_selection import train_test_split

# The target is the sale price; every remaining column is used as a feature.
y = df['selling_price']
X = df.drop(columns=['selling_price'])

# Hold out 30% of the rows for testing; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=814,
)


from sklearn.ensemble import RandomForestRegressor

# Baseline random forest: no hyperparameters are set here except the seed.
model = RandomForestRegressor(random_state=123)

# Fit on the training split, then predict on both splits so that train and
# test error can be compared afterwards.
model.fit(X_train, y_train)
train_pred = model.predict(X_train)
test_pred = model.predict(X_test)


from sklearn.metrics import mean_squared_error

# RMSE is computed as sqrt(MSE) explicitly. The `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so this form runs on both old and new versions.
print(
    "train_rmse: ", float(np.sqrt(mean_squared_error(y_train, train_pred))),
    "\ntest_rmse: ", float(np.sqrt(mean_squared_error(y_test, test_pred)))
)

train_rmse:  48436.98037690148 
test_rmse:  179288.6537737316


from sklearn.model_selection import KFold

df = df.reset_index(drop=True)

df.index

RangeIndex(start=0, stop=7906, step=1)


# 5-fold splitter; no shuffling is requested, and the folds printed below are
# contiguous index ranges.
kf = KFold(n_splits=5)

# Features / target taken again from the re-indexed DataFrame.
X = df.drop('selling_price', axis=1)
y = df['selling_price']

# Show which row indices land in the test and training part of each fold.
for train, test in kf.split(X):
    print("test : ", test)
    print("trian: ", train)
    print()

test :  [   0    1    2 ... 1579 1580 1581]
trian:  [1582 1583 1584 ... 7903 7904 7905]

test :  [1582 1583 1584 ... 3160 3161 3162]
trian:  [   0    1    2 ... 7903 7904 7905]

test :  [3163 3164 3165 ... 4741 4742 4743]
trian:  [   0    1    2 ... 7903 7904 7905]

test :  [4744 4745 4746 ... 6322 6323 6324]
trian:  [   0    1    2 ... 7903 7904 7905]

test :  [6325 6326 6327 ... 7903 7904 7905]
trian:  [   0    1    2 ... 6322 6323 6324]


# Per-fold RMSE on the training rows and on the held-out rows.
total_train_rmse = []
total_test_rmse = []

# Fit and evaluate once per fold produced by `kf`.
for train_index, test_index in kf.split(X):

    # kf.split yields positional row indices, so select by position (.iloc).
    # Label-based selection only gives the same rows when the index is 0..n-1.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # NOTE(review): the seed is drawn at random for every fold, so repeated
    # runs print different numbers — confirm this is intended.
    model = RandomForestRegressor(random_state=np.random.randint(1000))

    # Train on this fold's training rows.
    model.fit(X_train, y_train)

    # Predict on both parts of the fold.
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)

    # Store RMSE as sqrt(MSE); `squared=False` was deprecated in scikit-learn 1.4
    # and removed in 1.6.
    total_train_rmse.append(float(np.sqrt(mean_squared_error(y_train, train_pred))))
    total_test_rmse.append(float(np.sqrt(mean_squared_error(y_test, test_pred))))

# One line per fold: train RMSE followed by test RMSE.
for fold_train_rmse, fold_test_rmse in zip(total_train_rmse, total_test_rmse):
    print(fold_train_rmse, fold_test_rmse)

51242.051884221226 165206.70331799242
56073.3901592933 133310.48356030104
51870.40718729253 124608.70255010937
53648.060317139876 151825.15063185713
56876.22158334168 144253.88967046022


# Per-fold RMSE for the forest with hand-picked hyperparameters.
train_rmse_total = []
test_rmse_total = []

# Fit and evaluate once per fold produced by `kf`.
for train_index, test_index in kf.split(X):

    # kf.split yields positional row indices, so select by position (.iloc).
    # Label-based selection only gives the same rows when the index is 0..n-1.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # NOTE(review): the seed is drawn at random for every fold, so repeated
    # runs print different numbers — confirm this is intended.
    model = RandomForestRegressor(
        n_estimators=300,
        max_depth=50,
        min_samples_split=5,
        min_samples_leaf=1,
        n_jobs=-1,
        random_state=np.random.randint(1000),
    )

    # Train on this fold's training rows.
    model.fit(X_train, y_train)

    # Predict on both parts of the fold.
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)

    # Store RMSE as sqrt(MSE); `squared=False` was deprecated in scikit-learn 1.4
    # and removed in 1.6.
    train_rmse_total.append(float(np.sqrt(mean_squared_error(y_train, train_pred))))
    test_rmse_total.append(float(np.sqrt(mean_squared_error(y_test, test_pred))))


# Report the RMSE averaged over the folds, to eight decimals.
print(
    "train_rmse: %.8f" % (sum(train_rmse_total) / len(train_rmse_total)),
    "\n test_rmse: %.8f" % (sum(test_rmse_total) / len(test_rmse_total))
)

train_rmse: 67252.92642159 
 test_rmse: 143524.08060111


import joblib

# Persist the estimator currently bound to `model` to a file at a relative path.
# NOTE(review): at this point `model` is the one fitted in the last iteration of
# the fold loop above, i.e. trained on that fold's training rows only, not on
# all of X — confirm this is the model intended for saving.
joblib.dump(model, 'RandomForest_model.pkl')

['RandomForest_model.pkl']

	year	selling_price	km_driven	seats
count	7,906.00	7,906.00	7,906.00	7,906.00
mean	2,013.98	649,813.72	69,188.66	5.42
std	3.86	813,582.75	56,792.30	0.96
min	1,994.00	29,999.00	1.00	2.00
25%	2,012.00	270,000.00	35,000.00	5.00
50%	2,015.00	450,000.00	60,000.00	5.00
75%	2,017.00	690,000.00	95,425.00	5.00
max	2,020.00	10,000,000.00	2,360,457.00	14.00

scikit-learn Machine Learning LightGBM 사이킷런 머신러닝 LightGBM (0)	2022.11.21
scikit-learn Machine Learning XGBoost 사이킷런 머신러닝 XGBoost (0)	2022.11.21
scikit-learn Machine Learning DecisionTree 사이킷런 머신러닝 결정트리 (0)	2022.11.21
scikit-learn Machine Learning Naive Bayes Kor NLP 사이킷런 머신러닝 나이브베이즈 한글 자연어처리 (0)	2022.11.21
scikit-learn Machine Learning Naive Bayes Eng NLP 사이킷런 머신러닝 나이브베이즈 영어 자연어처리 (0)	2022.11.15

Try to 개발자 EthanJ의 성장 로그

Try to 개발자 EthanJ의 성장 로그

scikit-learn Machine Learning RandomForest 사이킷런 머신러닝 랜덤포레스트 본문

scikit-learn Machine Learning RandomForest 사이킷런 머신러닝 랜덤포레스트

scikit-learn Machine Learning RandomForest
사이킷런 머신러닝 랜덤포레스트

1. Data Collection¶

2. Data pre-processing¶

결측치 처리¶

이상치(Outlier)¶

단위 있거나 텍스트인 Data¶

`engine` 단위 처리¶

`max_power` 단위 처리¶

`mileage` 단위 처리¶

`fuel` 단위 처리¶

`torque` 단위 처리¶

Categorical Data 처리¶

`name`¶

`get_dummies()`¶

3. Training Model¶

4. Evaluating Model¶

K-Fold 교차검증¶

Hyperparameter tuning¶

'CS & DS > scikit-learn Machine Learning' 카테고리의 다른 글

티스토리툴바

	name	year	selling_price	km_driven	fuel	seller_type	transmission	owner	mileage	engine	max_power	torque	seats
0	Maruti Swift Dzire VDI	2014	450000	145500	Diesel	Individual	Manual	First Owner	23.4 kmpl	1248 CC	74 bhp	190Nm@ 2000rpm	5.0
1	Skoda Rapid 1.5 TDI Ambition	2014	370000	120000	Diesel	Individual	Manual	Second Owner	21.14 kmpl	1498 CC	103.52 bhp	250Nm@ 1500-2500rpm	5.0
2	Honda City 2017-2020 EXi	2006	158000	140000	Petrol	Individual	Manual	Third Owner	17.7 kmpl	1497 CC	78 bhp	12.7@ 2,700(kgm@ rpm)	5.0
3	Hyundai i20 Sportz Diesel	2010	225000	127000	Diesel	Individual	Manual	First Owner	23.0 kmpl	1396 CC	90 bhp	22.4 kgm at 1750-2750rpm	5.0
4	Maruti Swift VXI BSIII	2007	130000	120000	Petrol	Individual	Manual	First Owner	16.1 kmpl	1298 CC	88.2 bhp	11.5@ 4,500(kgm@ rpm)	5.0

	name	fuel	seller_type	transmission	owner	mileage	engine	max_power	torque
count	7906	7906	7906	7906	7906	7906	7906	7906	7906
unique	1982	4	3	2	5	393	121	320	441
top	Maruti Swift Dzire VDI	Diesel	Individual	Manual	First Owner	18.9 kmpl	1248 CC	74 bhp	190Nm@ 2000rpm
freq	129	4299	6563	6865	5215	225	1017	377	530

A	B	C	D	E
train	train	train	train	test
train	train	train	test	train
...	...	...	...	...	...
test	train	train	train	train

« 2025/06 »
일	월	화	수	목	금	토
1	2	3	4	5	6	7
8	9	10	11	12	13	14
15	16	17	18	19	20	21
22	23	24	25	26	27	28
29	30

Try to 개발자 EthanJ의 성장 로그

scikit-learn Machine Learning RandomForest 사이킷런 머신러닝 랜덤포레스트 본문

scikit-learn Machine Learning RandomForest 사이킷런 머신러닝 랜덤포레스트

scikit-learn Machine Learning RandomForest 사이킷런 머신러닝 랜덤포레스트

1. Data Collection¶

2. Data pre-processing¶

결측치 처리¶

이상치(Outlier)¶

단위 있거나 텍스트인 Data¶

engine 단위 처리¶

max_power 단위 처리¶

mileage 단위 처리¶

fuel 단위 처리¶

torque 단위 처리¶

Categorical Data 처리¶

name¶

get_dummies()¶

3. Training Model¶

4. Evaluating Model¶

K-Fold 교차검증¶

Hyperparameter tuning¶

'CS & DS > scikit-learn Machine Learning' 카테고리의 다른 글

티스토리툴바

scikit-learn Machine Learning RandomForest
사이킷런 머신러닝 랜덤포레스트

`engine` 단위 처리¶

`max_power` 단위 처리¶

`mileage` 단위 처리¶

`fuel` 단위 처리¶

`torque` 단위 처리¶

`name`¶

`get_dummies()`¶