scikit-learn Machine Learning LightGBM
EthanJ 2022. 11. 21. 14:51
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
Data Collection¶
# https://www.kaggle.com/datasets/kartik2112/fraud-detection
file_url = 'https://media.githubusercontent.com/media/musthave-ML10/data_source/main/fraud.csv'
df = pd.read_csv(file_url)
df.head()
trans_date_trans_time | cc_num | merchant | category | amt | first | last | gender | street | city | ... | lat | long | city_pop | job | dob | trans_num | unix_time | merch_lat | merch_long | is_fraud | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2019-01-01 00:00:18 | 2703186189652095 | fraud_Rippin, Kub and Mann | misc_net | 4.97 | Jennifer | Banks | F | 561 Perry Cove | Moravian Falls | ... | 36.0788 | -81.1781 | 3495 | Psychologist, counselling | 1988-03-09 | 0b242abb623afc578575680df30655b9 | 1325376018 | 36.011293 | -82.048315 | 0 |
1 | 2019-01-01 00:00:44 | 630423337322 | fraud_Heller, Gutmann and Zieme | grocery_pos | 107.23 | Stephanie | Gill | F | 43039 Riley Greens Suite 393 | Orient | ... | 48.8878 | -118.2105 | 149 | Special educational needs teacher | 1978-06-21 | 1f76529f8574734946361c461b024d99 | 1325376044 | 49.159047 | -118.186462 | 0 |
2 | 2019-01-01 00:00:51 | 38859492057661 | fraud_Lind-Buckridge | entertainment | 220.11 | Edward | Sanchez | M | 594 White Dale Suite 530 | Malad City | ... | 42.1808 | -112.2620 | 4154 | Nature conservation officer | 1962-01-19 | a1a22d70485983eac12b5b88dad1cf95 | 1325376051 | 43.150704 | -112.154481 | 0 |
3 | 2019-01-01 00:01:16 | 3534093764340240 | fraud_Kutch, Hermiston and Farrell | gas_transport | 45.00 | Jeremy | White | M | 9443 Cynthia Court Apt. 038 | Boulder | ... | 46.2306 | -112.1138 | 1939 | Patent attorney | 1967-01-12 | 6b849c168bdad6f867558c3793159a81 | 1325376076 | 47.034331 | -112.561071 | 0 |
4 | 2019-01-01 00:03:06 | 375534208663984 | fraud_Keeling-Crist | misc_pos | 41.96 | Tyler | Garcia | M | 408 Bradley Rest | Doe Hill | ... | 38.4207 | -79.4629 | 99 | Dance movement psychotherapist | 1986-03-28 | a41d7549acf90789359a9aa5346dcb46 | 1325376186 | 38.674999 | -78.632459 | 0 |
5 rows × 22 columns
df.columns
Index(['trans_date_trans_time', 'cc_num', 'merchant', 'category', 'amt', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip', 'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time', 'merch_lat', 'merch_long', 'is_fraud'], dtype='object')
- `trans_date_trans_time`: transaction timestamp
- `cc_num`: card number; unique per customer, so it can serve as an id here
- `merchant`: merchant where the transaction took place
- `category`: merchant category (pet supplies, travel, entertainment, etc.)
- `amt`: transaction amount
- `first`, `last`: customer name
- `gender`: gender
- `street`, `state`, `zip`: customer address information
- `lat`, `long`: latitude and longitude of the customer's address
- `city_pop`: population of the customer's zip code area
- `job`: occupation
- `dob`: date of birth
- `trans_num`: transaction number
- `unix_time`: transaction time as a Unix timestamp
- `merch_lat`, `merch_long`: latitude and longitude of the merchant
- `is_fraud`: whether the transaction is fraudulent — the target
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1852394 entries, 0 to 1852393 Data columns (total 22 columns): # Column Dtype --- ------ ----- 0 trans_date_trans_time object 1 cc_num int64 2 merchant object 3 category object 4 amt float64 5 first object 6 last object 7 gender object 8 street object 9 city object 10 state object 11 zip int64 12 lat float64 13 long float64 14 city_pop int64 15 job object 16 dob object 17 trans_num object 18 unix_time int64 19 merch_lat float64 20 merch_long float64 21 is_fraud int64 dtypes: float64(5), int64(5), object(12) memory usage: 310.9+ MB
df.info(show_counts=True)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1852394 entries, 0 to 1852393 Data columns (total 22 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 trans_date_trans_time 1852394 non-null object 1 cc_num 1852394 non-null int64 2 merchant 1852394 non-null object 3 category 1852394 non-null object 4 amt 1852394 non-null float64 5 first 1852394 non-null object 6 last 1852394 non-null object 7 gender 1852394 non-null object 8 street 1852394 non-null object 9 city 1852394 non-null object 10 state 1852394 non-null object 11 zip 1852394 non-null int64 12 lat 1852394 non-null float64 13 long 1852394 non-null float64 14 city_pop 1852394 non-null int64 15 job 1852394 non-null object 16 dob 1852394 non-null object 17 trans_num 1852394 non-null object 18 unix_time 1852394 non-null int64 19 merch_lat 1852394 non-null float64 20 merch_long 1852394 non-null float64 21 is_fraud 1852394 non-null int64 dtypes: float64(5), int64(5), object(12) memory usage: 310.9+ MB
pd.options.display.float_format = '{:.2f}'.format
df.describe()
cc_num | amt | zip | lat | long | city_pop | unix_time | merch_lat | merch_long | is_fraud | |
---|---|---|---|---|---|---|---|---|---|---|
count | 1852394.00 | 1852394.00 | 1852394.00 | 1852394.00 | 1852394.00 | 1852394.00 | 1852394.00 | 1852394.00 | 1852394.00 | 1852394.00 |
mean | 417386038394006464.00 | 70.06 | 48813.26 | 38.54 | -90.23 | 88643.67 | 1358674218.83 | 38.54 | -90.23 | 0.01 |
std | 1309115265318020352.00 | 159.25 | 26881.85 | 5.07 | 13.75 | 301487.62 | 18195081.39 | 5.11 | 13.76 | 0.07 |
min | 60416207185.00 | 1.00 | 1257.00 | 20.03 | -165.67 | 23.00 | 1325376018.00 | 19.03 | -166.67 | 0.00 |
25% | 180042946491150.00 | 9.64 | 26237.00 | 34.67 | -96.80 | 741.00 | 1343016823.75 | 34.74 | -96.90 | 0.00 |
50% | 3521417320836166.00 | 47.45 | 48174.00 | 39.35 | -87.48 | 2443.00 | 1357089331.00 | 39.37 | -87.44 | 0.00 |
75% | 4642255475285942.00 | 83.10 | 72042.00 | 41.94 | -80.16 | 20328.00 | 1374581485.25 | 41.96 | -80.25 | 0.00 |
max | 4992346398065154048.00 | 28948.90 | 99921.00 | 66.69 | -67.95 | 2906700.00 | 1388534374.00 | 67.51 | -66.95 | 1.00 |
Data pre-processing¶
Removing uninformative columns¶
df = df.drop(columns=['first', 'last', 'street', 'city', 'state', 'zip',
'trans_num', 'unix_time', 'job', 'merchant'])
df.columns
Index(['trans_date_trans_time', 'cc_num', 'category', 'amt', 'gender', 'lat', 'long', 'city_pop', 'dob', 'merch_lat', 'merch_long', 'is_fraud'], dtype='object')
df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1852394 entries, 0 to 1852393 Data columns (total 12 columns): # Column Dtype --- ------ ----- 0 trans_date_trans_time datetime64[ns] 1 cc_num int64 2 category object 3 amt float64 4 gender object 5 lat float64 6 long float64 7 city_pop int64 8 dob object 9 merch_lat float64 10 merch_long float64 11 is_fraud int64 dtypes: datetime64[ns](1), float64(5), int64(3), object(3) memory usage: 169.6+ MB
feature engineering¶
- Goal: detect transactions that deviate from the customer's usual spending pattern
Transaction amount¶
Z-score under a normal distribution: $\frac{x-\bar{x}}{\sigma}$
- `df.groupby(...).agg([list of stats])`: computes the listed statistics per group (see the sketch below)
- `mean`: average, `std`: standard deviation — here applied to `amt` (transaction amount)
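A minimal sketch of this pattern, assuming the `df` loaded above: selecting the `amt` column before `.agg()` computes only the statistics we need and also avoids the FutureWarning (shown below) about columns that cannot be aggregated.

```python
# Per-card mean/std of the transaction amount; the column is selected first so
# only 'amt' is aggregated (equivalent result to the cell below).
per_card = df.groupby('cc_num')['amt'].agg(['mean', 'std']).reset_index()
per_card.head()
```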
# mean and standard deviation of 'amt' per card number
amt_info = df.groupby('cc_num').agg(['mean', 'std'])['amt'].reset_index()
amt_info.head()
C:\Users\EthanJ\AppData\Local\Temp\ipykernel_14708\3458248078.py:2: FutureWarning: ['trans_date_trans_time', 'category', 'gender', 'dob'] did not aggregate successfully. If any error is raised this will raise in a future version of pandas. Drop these columns/ops to avoid this warning. amt_info = df.groupby('cc_num').agg(['mean', 'std'])['amt'].reset_index()
cc_num | mean | std | |
---|---|---|---|
0 | 60416207185 | 59.26 | 142.87 |
1 | 60422928733 | 65.48 | 92.04 |
2 | 60423098130 | 96.38 | 1000.69 |
3 | 60427851591 | 107.49 | 131.01 |
4 | 60487002085 | 64.10 | 153.21 |
df = df.merge(amt_info, on='cc_num', how='left')
df.columns
Index(['trans_date_trans_time', 'cc_num', 'category', 'amt', 'gender', 'lat', 'long', 'city_pop', 'dob', 'merch_lat', 'merch_long', 'is_fraud', 'mean', 'std'], dtype='object')
df[['cc_num', 'amt', 'mean', 'std']].head()
cc_num | amt | mean | std | |
---|---|---|---|---|
0 | 2703186189652095 | 4.97 | 89.41 | 127.53 |
1 | 630423337322 | 107.23 | 56.08 | 159.20 |
2 | 38859492057661 | 220.11 | 69.92 | 116.69 |
3 | 3534093764340240 | 45.00 | 80.09 | 280.08 |
4 | 375534208663984 | 41.96 | 95.34 | 94.32 |
df['amt_z_score'] = (df['amt'] - df['mean']) / df['std']
df['amt_z_score'].head()
0 -0.66 1 0.32 2 1.29 3 -0.13 4 -0.57 Name: amt_z_score, dtype: float64
df = df.drop(columns=['mean', 'std'])
df.columns
Index(['trans_date_trans_time', 'cc_num', 'category', 'amt', 'gender', 'lat', 'long', 'city_pop', 'dob', 'merch_lat', 'merch_long', 'is_fraud', 'amt_z_score'], dtype='object')
`amt` (transaction amount) by `category`¶
category_info = df.groupby(['cc_num', 'category']).agg(['mean', 'std'])['amt'].reset_index()
category_info.head()
C:\Users\EthanJ\AppData\Local\Temp\ipykernel_14708\2233020675.py:1: FutureWarning: ['trans_date_trans_time', 'gender', 'dob'] did not aggregate successfully. If any error is raised this will raise in a future version of pandas. Drop these columns/ops to avoid this warning. category_info = df.groupby(['cc_num', 'category']).agg(['mean', 'std'])['amt'].reset_index()
cc_num | category | mean | std | |
---|---|---|---|---|
0 | 60416207185 | entertainment | 51.84 | 65.49 |
1 | 60416207185 | food_dining | 26.74 | 46.38 |
2 | 60416207185 | gas_transport | 59.78 | 15.76 |
3 | 60416207185 | grocery_net | 52.15 | 17.69 |
4 | 60416207185 | grocery_pos | 101.56 | 21.89 |
df = df.merge(category_info, on=['cc_num', 'category'], how='left')
df.columns
Index(['trans_date_trans_time', 'cc_num', 'category', 'amt', 'gender', 'lat', 'long', 'city_pop', 'dob', 'merch_lat', 'merch_long', 'is_fraud', 'amt_z_score', 'mean', 'std'], dtype='object')
df[['cc_num', 'category', 'amt', 'mean', 'std']].head()
cc_num | category | amt | mean | std | |
---|---|---|---|---|---|
0 | 2703186189652095 | misc_net | 4.97 | 84.86 | 116.07 |
1 | 630423337322 | grocery_pos | 107.23 | 99.64 | 23.90 |
2 | 38859492057661 | entertainment | 220.11 | 46.65 | 60.39 |
3 | 3534093764340240 | gas_transport | 45.00 | 61.54 | 15.75 |
4 | 375534208663984 | misc_pos | 41.96 | 35.48 | 4.93 |
df['cate_z_score'] = (df['amt'] - df['mean']) / df['std']
df['cate_z_score'].head()
0 -0.69 1 0.32 2 2.87 3 -1.05 4 1.31 Name: cate_z_score, dtype: float64
df = df.drop(columns=['mean', 'std'])
df.columns
Index(['trans_date_trans_time', 'cc_num', 'category', 'amt', 'gender', 'lat', 'long', 'city_pop', 'dob', 'merch_lat', 'merch_long', 'is_fraud', 'amt_z_score', 'cate_z_score'], dtype='object')
distance¶
import geopy.distance
Distance between two points given as (latitude, longitude) pairs:
`geopy.distance.distance((lat1, long1), (lat2, long2))`
- coordinate: a (latitude, longitude) pair
- merchant location: merchant_coordinate → `merch_coor`
- customer location: customer_coordinate → `custo_coor`
df['merch_coor'] = pd.Series(zip(df['merch_lat'], df['merch_long']))
df['custo_coor'] = pd.Series(zip(df['lat'], df['long']))
df['distance'] = df.apply(lambda x: geopy.distance.distance(x['merch_coor'], x['custo_coor']).km, axis=1)
df['distance'].head()
0 78.77 1 30.22 2 108.10 3 95.69 4 77.70 Name: distance, dtype: float64
`distance(...).km` returns the distance in kilometers as a plain float.
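A minimal standalone sketch of the geopy call described above, with illustrative coordinates; the result object exposes `.km`, `.miles`, and `.meters`:

```python
import geopy.distance

p1 = (37.5665, 126.9780)   # (lat, long) — illustrative values
p2 = (35.1796, 129.0756)
d = geopy.distance.distance(p1, p2)
print(d.km, d.miles)       # both plain floats
```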
distance_info = df.groupby('cc_num').agg(['mean', 'std'])['distance'].reset_index()
distance_info.head()
C:\Users\EthanJ\AppData\Local\Temp\ipykernel_14708\2201206611.py:1: FutureWarning: ['trans_date_trans_time', 'category', 'gender', 'dob', 'merch_coor', 'custo_coor'] did not aggregate successfully. If any error is raised this will raise in a future version of pandas. Drop these columns/ops to avoid this warning. distance_info = df.groupby('cc_num').agg(['mean', 'std'])['distance'].reset_index()
cc_num | mean | std | |
---|---|---|---|
0 | 60416207185 | 73.53 | 28.70 |
1 | 60422928733 | 78.99 | 29.30 |
2 | 60423098130 | 77.83 | 28.19 |
3 | 60427851591 | 75.71 | 28.98 |
4 | 60487002085 | 79.44 | 28.77 |
df = df.merge(distance_info, on='cc_num', how='left')
df['distance_z_score'] = (df['distance'] - df['mean']) / df['std']
df['distance_z_score'].head()
0 0.03 1 -1.48 2 1.16 3 0.82 4 0.06 Name: distance_z_score, dtype: float64
df = df.drop(['mean', 'std'], axis=1)
df.head()
trans_date_trans_time | cc_num | category | amt | gender | lat | long | city_pop | dob | merch_lat | merch_long | is_fraud | amt_z_score | cate_z_score | merch_coor | custo_coor | distance | distance_z_score | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2019-01-01 00:00:18 | 2703186189652095 | misc_net | 4.97 | F | 36.08 | -81.18 | 3495 | 1988-03-09 | 36.01 | -82.05 | 0 | -0.66 | -0.69 | (36.011293, -82.048315) | (36.0788, -81.1781) | 78.77 | 0.03 |
1 | 2019-01-01 00:00:44 | 630423337322 | grocery_pos | 107.23 | F | 48.89 | -118.21 | 149 | 1978-06-21 | 49.16 | -118.19 | 0 | 0.32 | 0.32 | (49.159047, -118.186462) | (48.8878, -118.2105) | 30.22 | -1.48 |
2 | 2019-01-01 00:00:51 | 38859492057661 | entertainment | 220.11 | M | 42.18 | -112.26 | 4154 | 1962-01-19 | 43.15 | -112.15 | 0 | 1.29 | 2.87 | (43.150704, -112.154481) | (42.1808, -112.262) | 108.10 | 1.16 |
3 | 2019-01-01 00:01:16 | 3534093764340240 | gas_transport | 45.00 | M | 46.23 | -112.11 | 1939 | 1967-01-12 | 47.03 | -112.56 | 0 | -0.13 | -1.05 | (47.034331, -112.561071) | (46.2306, -112.1138) | 95.69 | 0.82 |
4 | 2019-01-01 00:03:06 | 375534208663984 | misc_pos | 41.96 | M | 38.42 | -79.46 | 99 | 1986-03-28 | 38.67 | -78.63 | 0 | -0.57 | 1.31 | (38.674999, -78.632459) | (38.4207, -79.4629) | 77.70 | 0.06 |
df = df.drop(columns=['cc_num', 'lat', 'long', 'merch_lat', 'merch_long', 'merch_coor', 'custo_coor'])
df.columns
Index(['trans_date_trans_time', 'category', 'amt', 'gender', 'city_pop', 'dob', 'is_fraud', 'amt_z_score', 'cate_z_score', 'distance', 'distance_z_score'], dtype='object')
df.isna().sum()
trans_date_trans_time 0 category 0 amt 0 gender 0 city_pop 0 dob 0 is_fraud 0 amt_z_score 0 cate_z_score 219 distance 0 distance_z_score 0 dtype: int64
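The 219 missing values in `cate_z_score` most likely come from `(cc_num, category)` groups containing a single transaction, where pandas' `std()` (with its default `ddof=1`) is undefined. A quick check under that assumption, reusing `category_info` from above:

```python
# Each NaN std in category_info corresponds to one single-transaction group,
# so this count should match the 219 NaNs reported above.
print(category_info['std'].isna().sum())
# LightGBM handles missing values natively, so the NaNs are left as-is here.
```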
`dob` → `age`¶
df['dob'].head()
0 1988-03-09 1 1978-06-21 2 1962-01-19 3 1967-01-12 4 1986-03-28 Name: dob, dtype: object
df['dob'] = pd.to_datetime(df['dob'])
df['dob']
0 1988-03-09 1 1978-06-21 2 1962-01-19 3 1967-01-12 4 1986-03-28 ... 1852389 1966-02-13 1852390 1999-12-27 1852391 1981-11-29 1852392 1965-12-15 1852393 1993-05-10 Name: dob, Length: 1852394, dtype: datetime64[ns]
# compute age from birth year
df['age'] = 2021 - df['dob'].dt.year
df['age']
0 33 1 43 2 59 3 54 4 35 .. 1852389 55 1852390 22 1852391 40 1852392 56 1852393 28 Name: age, Length: 1852394, dtype: int64
df = df.drop(columns=['dob'])
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 1852394 entries, 0 to 1852393 Data columns (total 11 columns): # Column Dtype --- ------ ----- 0 trans_date_trans_time datetime64[ns] 1 category object 2 amt float64 3 gender object 4 city_pop int64 5 is_fraud int64 6 amt_z_score float64 7 cate_z_score float64 8 distance float64 9 distance_z_score float64 10 age int64 dtypes: datetime64[ns](1), float64(5), int64(3), object(2) memory usage: 169.6+ MB
get_dummies()¶
df = pd.get_dummies(df, columns=['category', 'gender'], drop_first=True)
df.tail()
trans_date_trans_time | amt | city_pop | is_fraud | amt_z_score | cate_z_score | distance | distance_z_score | age | category_food_dining | ... | category_health_fitness | category_home | category_kids_pets | category_misc_net | category_misc_pos | category_personal_care | category_shopping_net | category_shopping_pos | category_travel | gender_M | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1852389 | 2020-12-31 23:59:07 | 43.77 | 519 | 0 | -0.17 | -0.05 | 77.03 | 0.05 | 55 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
1852390 | 2020-12-31 23:59:09 | 111.84 | 28739 | 0 | 0.36 | 1.16 | 100.02 | 0.69 | 22 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
1852391 | 2020-12-31 23:59:15 | 86.88 | 3684 | 0 | -0.02 | 0.44 | 80.89 | 0.29 | 40 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1852392 | 2020-12-31 23:59:24 | 7.99 | 129 | 0 | -0.59 | -0.01 | 53.06 | -0.73 | 56 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
1852393 | 2020-12-31 23:59:34 | 38.13 | 116001 | 0 | -0.15 | -0.38 | 72.38 | -0.16 | 28 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
5 rows × 23 columns
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 1852394 entries, 0 to 1852393 Data columns (total 23 columns): # Column Dtype --- ------ ----- 0 trans_date_trans_time datetime64[ns] 1 amt float64 2 city_pop int64 3 is_fraud int64 4 amt_z_score float64 5 cate_z_score float64 6 distance float64 7 distance_z_score float64 8 age int64 9 category_food_dining uint8 10 category_gas_transport uint8 11 category_grocery_net uint8 12 category_grocery_pos uint8 13 category_health_fitness uint8 14 category_home uint8 15 category_kids_pets uint8 16 category_misc_net uint8 17 category_misc_pos uint8 18 category_personal_care uint8 19 category_shopping_net uint8 20 category_shopping_pos uint8 21 category_travel uint8 22 gender_M uint8 dtypes: datetime64[ns](1), float64(5), int64(3), uint8(14) memory usage: 166.1 MB
set_index()¶
df = df.set_index('trans_date_trans_time')
df.index
DatetimeIndex(['2019-01-01 00:00:18', '2019-01-01 00:00:44', '2019-01-01 00:00:51', '2019-01-01 00:01:16', '2019-01-01 00:03:06', '2019-01-01 00:04:08', '2019-01-01 00:04:42', '2019-01-01 00:05:08', '2019-01-01 00:05:18', '2019-01-01 00:06:01', ... '2020-12-31 23:57:18', '2020-12-31 23:57:50', '2020-12-31 23:57:56', '2020-12-31 23:58:04', '2020-12-31 23:58:34', '2020-12-31 23:59:07', '2020-12-31 23:59:09', '2020-12-31 23:59:15', '2020-12-31 23:59:24', '2020-12-31 23:59:34'], dtype='datetime64[ns]', name='trans_date_trans_time', length=1852394, freq=None)
Training Model¶
from sklearn.model_selection import train_test_split
X = df.drop(columns=['is_fraud'])
y = df['is_fraud']
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size = 0.2, random_state = 814
)
import lightgbm as lgb
model = lgb.LGBMClassifier(random_state=np.random.randint(1000))
model.fit(X_train, y_train)
pred = model.predict(X_test)
Evaluating Model¶
from sklearn.metrics import accuracy_score
accuracy_score(y_test, pred)
0.9966016967223513
- Class imbalance: the data is heavily skewed toward non-fraud, so a model that always predicts 0 would already be about 99.48% accurate:
1 - df['is_fraud'].mean()
0.9947899852839083
Visualizing Data¶
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
cf_matrix = confusion_matrix(y_test, pred)
print(cf_matrix)
[[368030 558] [ 701 1190]]
group_name = ['TN', 'FP', 'FN', 'TP']
group_counts = ["{0:0.0f}".format(value) for value in cf_matrix.flatten()]
group_percent = ["{0:.2%}".format(value) for value in (cf_matrix.flatten() / np.sum(cf_matrix))]
labels = [f"{name}\n{counts}\n{percent}" for name, counts, percent in
zip(group_name, group_counts, group_percent)]
labels = np.asarray(labels).reshape(2, 2)
sns.heatmap(cf_matrix, annot=labels, fmt='', cmap='coolwarm')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()
- `precision`: penalized by Type I errors (false positives)
- `recall`: penalized by Type II errors (false negatives)
- `f1-score`: the harmonic mean of precision and recall
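Reading these off the confusion matrix above (TP = 1190, FP = 558, FN = 701), the fraud-class scores work out to precision $= \frac{TP}{TP+FP} = \frac{1190}{1748} \approx 0.68$ and recall $= \frac{TP}{TP+FN} = \frac{1190}{1891} \approx 0.63$, matching the classification report below.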
print(classification_report(y_test, pred))
precision recall f1-score support 0 1.00 1.00 1.00 368588 1 0.68 0.63 0.65 1891 accuracy 1.00 370479 macro avg 0.84 0.81 0.83 370479 weighted avg 1.00 1.00 1.00 370479
model.predict_proba()¶
- Returns the predicted probability behind each classification
proba = model.predict_proba(X_test)
proba
array([[9.99069879e-01, 9.30120853e-04], [9.99957475e-01, 4.25253222e-05], [9.99929953e-01, 7.00468547e-05], ..., [9.97326245e-01, 2.67375455e-03], [9.99684243e-01, 3.15757316e-04], [9.99924891e-01, 7.51086766e-05]])
- Each row corresponds to one test sample
- Column 1: predicted probability of class 0 (not the target)
- Column 2: predicted probability of class 1 (the target)
- Column 2 across all rows: the predicted probability of fraud, i.e. the target:
proba[:, 1]
array([9.30120853e-04, 4.25253222e-05, 7.00468547e-05, ..., 2.67375455e-03, 3.15757316e-04, 7.51086766e-05])
proba = proba[:, 1]
- Default threshold `0.5`: predict 1 when the probability exceeds 50%
- `0.2`: predict 1 when the probability exceeds just 20%
- `0.8`: predict 1 only when the probability exceeds 80%
Casting `True`/`False` to `1`/`0` via `astype('int')`:
proba_case1 = (proba > 0.2).astype('int')
proba_case2 = (proba > 0.8).astype('int')
def confusion_matrix_view(y_test, pred):
cf_matrix = confusion_matrix(y_test, pred)
print(cf_matrix)
group_names = ['TN','FP','FN','TP']
group_counts = ["{0:0.0f}".format(value) for value in
cf_matrix.flatten()]
group_percentages = ["{0:.2%}".format(value) for value in
cf_matrix.flatten()/np.sum(cf_matrix)]
labels = [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in
zip(group_names,group_counts,group_percentages)]
labels = np.asarray(labels).reshape(2,2)
sns.heatmap(cf_matrix, annot=labels, fmt='', cmap='coolwarm')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()
print(classification_report(y_test, pred))
confusion_matrix_view(y_test, pred)
[[368030 558] [ 701 1190]]
precision recall f1-score support 0 1.00 1.00 1.00 368588 1 0.68 0.63 0.65 1891 accuracy 1.00 370479 macro avg 0.84 0.81 0.83 370479 weighted avg 1.00 1.00 1.00 370479
- Case 1) predict `1` above `0.2`: Type I errors (false positives) increase, Type II errors (false negatives) decrease
confusion_matrix_view(y_test, proba_case1)
[[367529 1059] [ 552 1339]]
precision recall f1-score support 0 1.00 1.00 1.00 368588 1 0.56 0.71 0.62 1891 accuracy 1.00 370479 macro avg 0.78 0.85 0.81 370479 weighted avg 1.00 1.00 1.00 370479
- Case 2) predict `1` above `0.8`: Type I errors decrease, Type II errors increase
confusion_matrix_view(y_test, proba_case2)
[[368283 305] [ 933 958]]
precision recall f1-score support 0 1.00 1.00 1.00 368588 1 0.76 0.51 0.61 1891 accuracy 1.00 370479 macro avg 0.88 0.75 0.80 370479 weighted avg 1.00 1.00 1.00 370479
ROC curve & AUC¶
A method for evaluating binary classification models; because it does not depend on any single decision threshold, it is useful for comparing models.
- AUC: the area under the ROC curve, ranging from `0.5` to `1` — the larger, the better the model
- The ROC curve plots how `TPR` and `FPR` change as the decision threshold moves
- Worst case (a model that has learned nothing at all): the diagonal, often drawn as a red dashed line
AUC (Area Under the ROC Curve)¶
The area under the ROC curve: a value between `0.5` and `1`, where a larger value means a better classifier.
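As a sketch of what this section describes, the curve itself can be drawn with scikit-learn's `roc_curve`, reusing the `proba` array (fraud probabilities) computed above; the red dashed diagonal is the no-skill baseline:

```python
from sklearn.metrics import roc_curve

# TPR vs. FPR at every threshold, with the AUC in the legend
fpr, tpr, thresholds = roc_curve(y_test, proba)
plt.plot(fpr, tpr, label=f'LGBM (AUC = {roc_auc_score(y_test, proba):.3f})')
plt.plot([0, 1], [0, 1], 'r--', label='no-skill baseline')
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.legend()
plt.show()
```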
RandomGridSearch¶
- Grid search: fits a model for every combination of hyperparameter values
- Randomized search: samples only a subset of combinations, covering a wider range of hyperparameter values in less time
from sklearn.model_selection import RandomizedSearchCV
parameters = {
'n_estimators': [100, 500, 1000],
'learning_rate': [0.01, 0.05, 0.1, 0.25],
'lambda_l1': [0, 10, 20, 30, 50],
'lambda_l2': [0, 10, 20, 30, 50],
'max_depth': [5, 10, 15, 20],
'subsample': [0.5, 0.75, 1]
}
L1 & L2 regularization¶
- Lasso regression: L1 regularization (`lambda_l1`)
- Ridge regression: L2 regularization (`lambda_l2`)
Both penalize the model's parameters to shrink their influence (coefficients), with the aim of preventing overfitting.
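An illustrative sketch, separate from this post's pipeline: scikit-learn's `Lasso` and `Ridge` apply the same L1/L2 penalties in a plain linear-regression setting (all data here is synthetic):

```python
import numpy as np
from sklearn.linear_model import Lasso, Ridge

# Synthetic data where only features 0 and 3 actually matter
X_demo = np.random.rand(100, 5)
y_demo = X_demo @ np.array([3.0, 0.0, 0.0, 1.5, 0.0]) + 0.1 * np.random.rand(100)

lasso = Lasso(alpha=0.1).fit(X_demo, y_demo)  # L1: can zero out coefficients
ridge = Ridge(alpha=0.1).fit(X_demo, y_demo)  # L2: shrinks coefficients toward 0
print(lasso.coef_)
print(ridge.coef_)
```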
new_model = lgb.LGBMClassifier(random_state=np.random.randint(1000))
rsCV = RandomizedSearchCV(new_model, param_distributions=parameters, n_iter=30,
scoring='roc_auc', random_state=np.random.randint(1000), n_jobs=6)
import time
start = time.time()
rsCV.fit(X_train, y_train)
print(time.time() - start)
[LightGBM] [Warning] lambda_l1 is set=0, reg_alpha=0.0 will be ignored. Current value: lambda_l1=0 [LightGBM] [Warning] lambda_l2 is set=50, reg_lambda=0.0 will be ignored. Current value: lambda_l2=50 1051.4817752838135
rsCV.best_params_
{'subsample': 0.5, 'n_estimators': 500, 'max_depth': 20, 'learning_rate': 0.25, 'lambda_l2': 50, 'lambda_l1': 0}
rsCV_proba = rsCV.predict_proba(X_test)
roc_auc_score(y_test, rsCV_proba[:, 1])
0.9964346064160456
rsCV_proba_int = (rsCV_proba[:, 1] > 0.2).astype('int')
confusion_matrix_view(y_test, rsCV_proba_int)
[[367768 820] [ 358 1533]]
precision recall f1-score support 0 1.00 1.00 1.00 368588 1 0.65 0.81 0.72 1891 accuracy 1.00 370479 macro avg 0.83 0.90 0.86 370479 weighted avg 1.00 1.00 1.00 370479
confusion_matrix_view(y_test, proba_case1)
[[367529 1059] [ 552 1339]]
precision recall f1-score support 0 1.00 1.00 1.00 368588 1 0.56 0.71 0.62 1891 accuracy 1.00 370479 macro avg 0.78 0.85 0.81 370479 weighted avg 1.00 1.00 1.00 370479
confusion_matrix_view(y_test, proba_case2)
[[368283 305] [ 933 958]]
precision recall f1-score support 0 1.00 1.00 1.00 368588 1 0.76 0.51 0.61 1891 accuracy 1.00 370479 macro avg 0.88 0.75 0.80 370479 weighted avg 1.00 1.00 1.00 370479
train() method¶
LightGBM provides its native `lgb.train()` function alongside the scikit-learn-style `model.fit(X_train, y_train)`:
 | train | fit |
---|---|---|
validation set | monitored during training | not used while fitting |
data set | DataFrame must be converted to a separate format (`lgb.Dataset`) | DataFrame, Series |
hyperparameters | no defaults | defaults provided |
scikit-learn integration | X | O |
- `fit`: learns from the train set → model → evaluated on the test set
- `train`: learns from the train set while monitoring a separate validation set → model
def get_X_y(df):
X = df.drop(columns=['is_fraud'])
y = df['is_fraud']
return (X, y)
X, y = get_X_y(df)
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=814
)
X_train, X_val, y_train, y_val = train_test_split(
X_train, y_train, test_size=0.2, random_state=814
)
data_train = lgb.Dataset(X_train, label=y_train)
data_val = lgb.Dataset(X_val, label=y_val)
param_set = rsCV.best_params_
param_set['metrics'] = 'auc'  # validation metric (present in the output below)
param_set
{'subsample': 0.5, 'n_estimators': 500, 'max_depth': 20, 'learning_rate': 0.25, 'lambda_l2': 50, 'lambda_l1': 0, 'metrics': 'auc'}
model_train = lgb.train(param_set, data_train, valid_sets=[data_val],
                        # early_stopping_rounds: stop when the validation score
                        # has not improved for this many rounds (caps training time)
                        # verbose_eval: print intermediate results at this interval
                        early_stopping_rounds=100, verbose_eval=100)
C:\Users\EthanJ\anaconda3\lib\site-packages\lightgbm\engine.py:177: UserWarning: Found `n_estimators` in params. Will use it instead of argument _log_warning(f"Found `{alias}` in params. Will use it instead of argument") C:\Users\EthanJ\anaconda3\lib\site-packages\lightgbm\engine.py:181: UserWarning: 'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead. _log_warning("'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. " C:\Users\EthanJ\anaconda3\lib\site-packages\lightgbm\engine.py:239: UserWarning: 'verbose_eval' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead. _log_warning("'verbose_eval' argument is deprecated and will be removed in a future release of LightGBM. "
[LightGBM] [Warning] Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves. (num_leaves=31). [LightGBM] [Warning] Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves. (num_leaves=31). [LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004016 seconds. You can set `force_row_wise=true` to remove the overhead. And if memory is not enough, you can set `force_col_wise=true`. [LightGBM] [Info] Total Bins 1632 [LightGBM] [Info] Number of data points in the train set: 1185532, number of used features: 21 [LightGBM] [Warning] Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves. (num_leaves=31). [LightGBM] [Info] Start training from score 0.005262 Training until validation scores don't improve for 100 rounds [100] valid_0's auc: 0.993644 Early stopping, best iteration is: [97] valid_0's auc: 0.994762
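The deprecation warnings above point to the callbacks API; an equivalent call under that assumption (same 100-round early stopping and logging interval) would be:

```python
model_train = lgb.train(param_set, data_train, valid_sets=[data_val],
                        callbacks=[lgb.early_stopping(100),
                                   lgb.log_evaluation(100)])
```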
pred_train = model_train.predict(X_test)
roc_auc_score(y_test, pred_train)
0.9929069517179909
feature_imptnt = pd.DataFrame({'features': X_train.columns, 'values': model.feature_importances_})
plt.figure(figsize=(20, 10))
sns.barplot(x='values', y='features',
data=feature_imptnt.sort_values(by='values', ascending=False).head(10))
plt.show()
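Note that `feature_importances_` above comes from `model`, the earlier sklearn-wrapper `LGBMClassifier`. For the Booster returned by `lgb.train()`, importances are exposed as a method instead; a sketch:

```python
# Booster API: feature_importance() returns per-feature split counts by default
booster_imptnt = pd.DataFrame({'features': X_train.columns,
                               'values': model_train.feature_importance()})
```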