import pandas as pd
import numpy as np


# Ten-element Series (values 0-9) on a two-level hierarchical index:
# level 0 is a group letter, level 1 a counter that restarts at 1 per letter.
hier_sr = pd.Series(
    data=np.arange(10),
    index=pd.MultiIndex.from_arrays([
        list('aaabbccccd'),
        [1, 2, 3, 1, 2, 1, 2, 3, 4, 1],
    ]),
)

hier_sr

a  1    0
   2    1
   3    2
b  1    3
   2    4
c  1    5
   2    6
   3    7
   4    8
d  1    9
dtype: int32


hier_sr.index

MultiIndex([('a', 1),
            ('a', 2),
            ('a', 3),
            ('b', 1),
            ('b', 2),
            ('c', 1),
            ('c', 2),
            ('c', 3),
            ('c', 4),
            ('d', 1)],
           )


hier_sr['c']

1    5
2    6
3    7
4    8
dtype: int32


hier_sr['b':'d']

b  1    3
   2    4
c  1    5
   2    6
   3    7
   4    8
d  1    9
dtype: int32


hier_sr[:, 2]

a    1
b    4
c    6
dtype: int32


hier_sr['c'].loc[2:4]

2    6
3    7
4    8
dtype: int32


hier_sr

a  1    0
   2    1
   3    2
b  1    3
   2    4
c  1    5
   2    6
   3    7
   4    8
d  1    9
dtype: int32


unstk_df = hier_sr.unstack()

unstk_df


unstk_df.stack()

a  1    0.0
   2    1.0
   3    2.0
b  1    3.0
   2    4.0
c  1    5.0
   2    6.0
   3    7.0
   4    8.0
d  1    9.0
dtype: float64


area_df = pd.DataFrame(np.random.randint(1, 21, (20)).reshape(4,5),
                     index=[[2017, 2017, 2018, 2018],
                           ['a', 'b', 'a', 'b']],
                     columns=[['서울', '서울', '경기', '경기', '경기'],
                             ['강남', '잠실', '분당', '수원', '판교']])

area_df


area_df['서울']


area_df['서울']['강남']

2017  a    10
      b     9
2018  a    13
      b    11
Name: 강남, dtype: int32


area_df[('서울', '강남')]

2017  a    10
      b     9
2018  a    13
      b    11
Name: (서울, 강남), dtype: int32


area_df.loc[2017]


area_df.loc[2017].loc['a']

서울  강남    10
    잠실     8
경기  분당    10
    수원     7
    판교     6
Name: a, dtype: int32


area_df.loc[(2017, 'a')]

서울  강남    10
    잠실     8
경기  분당    10
    수원     7
    판교     6
Name: (2017, a), dtype: int32


area_df['경기'].loc[:, '분당':'수원']


area_df


area_df.unstack()


area_df.stack()


area_df.swaplevel(1, 0)


area_df.swaplevel(0, 1)


area_df.swaplevel(1, 0, axis=1)


st_sr = pd.Series([2, 3, 1, 7, 0], index=list('gacfd'))

st_sr

g    2
a    3
c    1
f    7
d    0
dtype: int64


st_sr.sort_index()

a    3
c    1
d    0
f    7
g    2
dtype: int64


st_sr.sort_index(ascending=False)

g    2
f    7
d    0
c    1
a    3
dtype: int64


st_sr

g    2
a    3
c    1
f    7
d    0
dtype: int64


st_sr.sort_values()

d    0
c    1
g    2
a    3
f    7
dtype: int64


st_sr.sort_values(ascending=False)

f    7
a    3
g    2
c    1
d    0
dtype: int64


# Fix the RNG seed so the random frame below is reproducible.
np.random.seed(4)

# 4x5 frame of integers drawn from [0, 20), with row and column labels
# given in unsorted order.
sort_df = pd.DataFrame(
    data=np.random.randint(0, 20, size=(4, 5)),
    index=['d', 'b', 'c', 'a'],
    columns=['C', 'B', 'A', 'D', 'E'],
)

sort_df


sort_df.sort_index()


sort_df.sort_index(ascending=False)


sort_df.sort_index(axis=1)


sort_df.sort_index(axis=1, ascending=False)


sort_df.sort_index(axis=1, ascending=False).sort_index()


sort_df


sort_df.sort_values(axis=0, by='D')


sort_df.sort_values(by='A', ascending=False)


sort_df.sort_values(by='d', axis=1)


sort_df.sort_values(by='c', axis=1, ascending=False)


sort_df.sort_values(by=['E', 'D'], axis=0)


sort_df.sort_values(by=['a', 'c'], axis=1, ascending=[True, False])


columns_list = [[2016, 2016, 2017, 2017], ['영어', '수학', '영어', '수학']]

row_list = ['Kim', 'Park', 'Lee', 'Jung', 'Moon']

score_df1 = pd.DataFrame(np.random.randint(50, 100, size=(5,4)),
                         columns=columns_list,
                         index=row_list)

score_df1


score_df1.columns.rename(['년도', '과목'], inplace=True)
score_df1.index.rename('학생명', inplace=True)

score_df1


columns_list_tuple = [(2016, '영어'), (2016, '수학'), (2017, '영어'), (2017, '수학')]
columns_index = pd.MultiIndex.from_tuples(columns_list_tuple, names=['년도', '과목'])

row_index = pd.Index(['Kim', 'Park', 'Lee', 'Jung', 'Moon'], name='학생명')
score_df2 = pd.DataFrame(np.random.randint(50, 100, size=(5, 4)),
                        columns=columns_index,
                        index=row_index)

score_df2


columns_level_list = [[2016, 2017], ['영어', '수학']]
columns_index = pd.MultiIndex.from_product(columns_level_list, names=['년도', '과목'])
print(columns_index)

row_index = pd.Index(['Kim', 'Park', 'Lee', 'Jung', 'Moon'], name='학생명')
score_df = pd.DataFrame(np.random.randint(50, 100, size=(5, 4)),
                       columns=columns_index,
                       index=row_index)

score_df

MultiIndex([(2016, '영어'),
            (2016, '수학'),
            (2017, '영어'),
            (2017, '수학')],
           names=['년도', '과목'])


df_2016 = score_df[2016].copy()

df_2016


df_2016.sort_index()


df_2016.sort_values(by=['영어', '수학'], ascending=[False, True])


df_name = pd.DataFrame({'no':[30, 31, 32, 33, 34],
                        'name':["김파썬", "이장고", "박판다", "최넘파", "강주피"]})

df_name


df_amt = pd.DataFrame({'no':[30, 32, 33, 40, 41],
                       'amount':[100, 40, 130, 40, 60]})

df_amt


pd.merge(df_name, df_amt)


pd.merge(df_name, df_amt, how='outer')


pd.merge(df_name, df_amt, how='left')


pd.merge(df_name, df_amt, how='right')


df_date_phone = pd.DataFrame({'고객명':['김파이썬', '이장고', '박팬더스'],
                   '날짜':['2022-10-22', '2022-10-23', '2022-10-24'],
                   '정보':['010', '011', '019']})
df_date_phone


df_sex = pd.DataFrame({'고객명':['김파이썬', '박팬더스', '최넘파이'],
                   '정보':['F', 'M', 'M']})
df_sex


pd.merge(df_date_phone, df_sex)


pd.merge(df_date_phone, df_sex, on='고객명')


df_date_price = pd.DataFrame({'고객이름':['김파이썬', '박팬더스', '강주피터'],
                   '날짜':['2020-01-01', '2020-02-01', '2020-02-15'],
                   '구매금액':[1, 2, 3]})
df_date_price


df_sex = pd.DataFrame({'고객명':['김파이썬', '박팬더스'],
                   '성별':['F', 'M']})
df_sex


df_client = pd.merge(df_date_price, df_sex, left_on='고객이름', right_on='고객명')

df_client


df_client = df_client.drop('고객명', axis=1)

df_client


sr_1 = pd.Series([1, 2, 3], index=list('abc'))
sr_2 = pd.Series([5, 6, 7, 8], index=list('acef'))

print(sr_1)
print(sr_2)

a    1
b    2
c    3
dtype: int64
a    5
c    6
e    7
f    8
dtype: int64


pd.concat([sr_1, sr_2])

a    1
b    2
c    3
a    5
c    6
e    7
f    8
dtype: int64


pd.concat([sr_1, sr_2], ignore_index=True)

0    1
1    2
2    3
3    5
4    6
5    7
6    8
dtype: int64


print(sr_1, sr_2)

a    1
b    2
c    3
dtype: int64 a    5
c    6
e    7
f    8
dtype: int64


pd.concat([sr_1, sr_2], axis=1)


pd.concat([sr_1, sr_2], axis=1, keys=['c1', 'c2'], sort=False)


df_date_price = pd.DataFrame({'고객명':['김파이썬', '이장고', '박팬더스'],
                   '날짜':['2022-10-22', '2022-10-23', '2022-12-14'],
                   '구매금액':[1, 2, 3]})
df_date_price


df_sex = pd.DataFrame({'고객명':['김파이썬', '최넘파이'],
                   '성별':['F', 'M']})
df_sex


pd.concat([df_date_price, df_sex], axis=0)


pd.concat([df_date_price, df_sex], axis=1)


pop_data = pd.read_excel('data/인구수예제.xlsx')

pop_data


pop_data.shape

(50, 6)


pop_data.head()


pop_data.tail()


pop_data.head(10)


pop_data.groupby('자치구')[['남자인구', '여자인구']].sum()


pop_data.groupby('도시')[['남자인구', '여자인구']].sum()


pop_data.groupby(['연도', '도시']).sum()


pop_data.groupby(['도시', '연도'])[['총인구']].mean()


pop_data.groupby(['도시', '연도'])[['총인구']].mean().unstack()


pop_data.groupby(['도시', '연도'])[['총인구']].mean().unstack(0)


my_index = pd.Index(['Kim', 'Park', 'Lee', 'Jung', 'Moon'], name='학생명')
my_columns = pd.MultiIndex.from_product([[2016, 2017], ['영어', '수학']],
                                       names=['연도', '과목'])

df_score = pd.DataFrame(np.random.randint(50, 100, (5, 4)),
                  index=my_index, columns=my_columns)
df_score


df_score.describe()


df_2017 = df_score[[2017]].copy()

df_2017


df_2017.shape

(5, 2)


df_2017.count()

연도    과목
2017  영어    5
      수학    5
dtype: int64


df_2017.count(axis=1)

학생명
Kim     2
Park    2
Lee     2
Jung    2
Moon    2
dtype: int64


pop_data = pd.read_excel('data/인구수예제.xlsx')

pop_data


pop_data['도시'].value_counts()

서울    35
부산    15
Name: 도시, dtype: int64


# axis=0: 과목별 총합
df_2017.sum()

연도    과목
2017  영어    357
      수학    376
dtype: int64


# axis=1: 학생별 총합
df_2017.sum(axis=1)

학생명
Kim     107
Park    172
Lee     166
Jung    160
Moon    128
dtype: int64


# default: axis=0: 과목별 평균
df_2017.mean()

연도    과목
2017  영어    71.4
      수학    75.2
dtype: float64


# axis=1: 학생별 평균
df_2017.mean(axis=1)

학생명
Kim     53.5
Park    86.0
Lee     83.0
Jung    80.0
Moon    64.0
dtype: float64


df_score


df_mean = df_score[[2016, 2017]].mean(axis=0)

df_mean

연도    과목
2016  영어    77.2
      수학    68.4
2017  영어    71.4
      수학    75.2
dtype: float64


df_mean.unstack(1)


df_score.mean(axis=1)

학생명
Kim     55.00
Park    79.25
Lee     78.00
Jung    79.75
Moon    73.25
dtype: float64


str_date = ['2022/11/01', '2022.10.31', '2021-10-09']


# Series로 저장
pd.Series(str_date)

0    2022/11/01
1    2022.10.31
2    2021-10-09
dtype: object


pd.to_datetime(str_date)

DatetimeIndex(['2022-11-01', '2022-10-31', '2021-10-09'], dtype='datetime64[ns]', freq=None)


type(pd.to_datetime(str_date))

pandas.core.indexes.datetimes.DatetimeIndex


# Spell out the unit: a unit-less 'datetime64' dtype is rejected by
# pandas 2.x, while 'datetime64[ns]' yields the same datetime64[ns] Series
# on both older and newer releases.
pd.Series(str_date).astype('datetime64[ns]')

0   2022-11-01
1   2022-10-31
2   2021-10-09
dtype: datetime64[ns]


stamp_date = [1234000, 1256000, 1278000, 1290000, 1234567]

pd.Series(stamp_date)

0    1234000
1    1256000
2    1278000
3    1290000
4    1234567
dtype: int64


pd.Series(pd.to_datetime(stamp_date))

0   1970-01-01 00:00:00.001234000
1   1970-01-01 00:00:00.001256000
2   1970-01-01 00:00:00.001278000
3   1970-01-01 00:00:00.001290000
4   1970-01-01 00:00:00.001234567
dtype: datetime64[ns]


pd.Series(pd.to_datetime(stamp_date, unit='s'))

0   1970-01-15 06:46:40
1   1970-01-15 12:53:20
2   1970-01-15 19:00:00
3   1970-01-15 22:20:00
4   1970-01-15 06:56:07
dtype: datetime64[ns]


pd.Series(pd.to_datetime(stamp_date, unit='D'))

---------------------------------------------------------------------------
OverflowError                             Traceback (most recent call last)
~\anaconda3\lib\site-packages\pandas\_libs\tslib.pyx in pandas._libs.tslib.array_with_unit_to_datetime()

~\anaconda3\lib\site-packages\pandas\_libs\tslibs\conversion.pyx in pandas._libs.tslibs.conversion.cast_from_unit()

OverflowError: int too big to convert

During handling of the above exception, another exception occurred:

OutOfBoundsDatetime                       Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_596\2430875186.py in <module>
----> 1 pd.Series(pd.to_datetime(stamp_date, unit='D'))

~\anaconda3\lib\site-packages\pandas\core\tools\datetimes.py in to_datetime(arg, errors, dayfirst, yearfirst, utc, format, exact, unit, infer_datetime_format, origin, cache)
   1074             result = _convert_and_box_cache(arg, cache_array)
   1075         else:
-> 1076             result = convert_listlike(arg, format)
   1077     else:
   1078         result = convert_listlike(np.array([arg]), format)[0]

~\anaconda3\lib\site-packages\pandas\core\tools\datetimes.py in _convert_listlike_datetimes(arg, format, name, tz, unit, errors, infer_datetime_format, dayfirst, yearfirst, exact)
    355         if format is not None:
    356             raise ValueError("cannot specify both format and unit")
--> 357         return _to_datetime_with_unit(arg, unit, name, tz, errors)
    358     elif getattr(arg, "ndim", 1) > 1:
    359         raise TypeError(

~\anaconda3\lib\site-packages\pandas\core\tools\datetimes.py in _to_datetime_with_unit(arg, unit, name, tz, errors)
    528     else:
    529         arg = np.asarray(arg)
--> 530         arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)
    531 
    532     if errors == "ignore":

~\anaconda3\lib\site-packages\pandas\_libs\tslib.pyx in pandas._libs.tslib.array_with_unit_to_datetime()

OutOfBoundsDatetime: cannot convert input 1234000 with the unit 'D'


pd.Series(pd.to_datetime(stamp_date, unit='ms'))

0   1970-01-01 00:20:34.000
1   1970-01-01 00:20:56.000
2   1970-01-01 00:21:18.000
3   1970-01-01 00:21:30.000
4   1970-01-01 00:20:34.567
dtype: datetime64[ns]


pd.Series(pd.to_datetime(stamp_date, unit='us'))

0   1970-01-01 00:00:01.234000
1   1970-01-01 00:00:01.256000
2   1970-01-01 00:00:01.278000
3   1970-01-01 00:00:01.290000
4   1970-01-01 00:00:01.234567
dtype: datetime64[ns]


gender_df = pd.DataFrame({'gender':[0, 1, 0, 0, 0, 1]})
gender_map = {0:'Male', 1:'Female'}

gender_df


gender_df['gender'].map(gender_map)

0      Male
1    Female
2      Male
3      Male
4      Male
5    Female
Name: gender, dtype: object


gender_df['gender'].replace(0, "Male").replace(1, "Female")

0      Male
1    Female
2      Male
3      Male
4      Male
5    Female
Name: gender, dtype: object


gender_df['gender'].replace([0, 1], ['Male', 'Female'])

0      Male
1    Female
2      Male
3      Male
4      Male
5    Female
Name: gender, dtype: object


gender_df['gender'].replace(gender_map)

0      Male
1    Female
2      Male
3      Male
4      Male
5    Female
Name: gender, dtype: object


# Small demo frame whose column labels start with a capital letter.
my_data = {
    'Name': ['Jane', 'Albert', 'John'],
    'Age': [18, 19, 21],
}

my_df = pd.DataFrame(my_data)
my_df


# Work on a copy so the original frame keeps its capitalised labels.
my_df1 = my_df.copy()

# Collect the lower-cased column labels, echoing each one as it is added.
new_cols = []

for col in my_df.columns:
    lowered = col.lower()
    print(lowered)
    new_cols.append(lowered)

name
age


new_cols

['name', 'age']


my_df1.columns = new_cols

my_df1


my_df2 = my_df.copy()

my_df2.columns = my_df2.columns.str.upper()
my_df2


def change_lower(value):
    """Return ``value`` converted to lower case via its ``lower()`` method."""
    lowered = value.lower()
    return lowered


my_df2['NAME'].apply(change_lower)

0      jane
1    albert
2      john
Name: NAME, dtype: object


my_df2


my_df2['NAME'].map(change_lower)

0      jane
1    albert
2      john
Name: NAME, dtype: object


def is_adult(value: int) -> str:
    """Label an age as adult ("성인") or minor ("미성년자").

    Values of 20 and above are labelled adult; anything that does not
    satisfy ``value >= 20`` is labelled minor.
    """
    return "성인" if value >= 20 else "미성년자"


my_df2['AGE'].map(is_adult)

0    미성년자
1    미성년자
2      성인
Name: AGE, dtype: object


this_sample = pd.read_csv('data/csv_exam_nan.csv')

this_sample


this_sample.dropna()


this_sample.dropna(how='all')


this_sample.isnull()


this_sample.isnull().any()

math       True
english    True
science    True
dtype: bool


this_sample.isnull().any(axis=1)

0     True
1    False
2     True
3     True
4    False
5    False
dtype: bool


this_sample.isnull().sum()

math       1
english    2
science    3
dtype: int64


this_sample[this_sample.isnull().any(axis=1)]


sample = this_sample

sample.fillna(0)


sample.mean()

math       76.00
english    87.25
science    84.00
dtype: float64


tot_avg = sample.fillna(0).values.mean()

sample.fillna(tot_avg)


# Compute the per-column means once and read them by position with .iloc.
# Plain [0]/[1]/[2] on this label-indexed Series relies on a positional
# fallback that recent pandas releases deprecate.
col_means = sample.mean()
print(col_means.iloc[0])
print(col_means.iloc[1])
print(col_means.iloc[2])

76.0
87.25
84.0


# 컬럼 지정 후 평균 구하기
sample['math'].mean()

76.0


sample.columns

Index(['math', 'english', 'science'], dtype='object')


for col in sample.columns:
    print(sample[col].fillna(sample[col].mean()))

0    70.0
1    75.0
2    76.0
3    56.0
4    89.0
5    90.0
Name: math, dtype: float64
0     87.25
1     65.00
2     87.25
3     89.00
4     95.00
5    100.00
Name: english, dtype: float64
0    84.0
1    80.0
2    84.0
3    84.0
4    83.0
5    89.0
Name: science, dtype: float64


sample_1dim = pd.Series(sample.values.reshape(sample.size))

sample_1dim

0      70.0
1       NaN
2       NaN
3      75.0
4      65.0
5      80.0
6       NaN
7       NaN
8       NaN
9      56.0
10     89.0
11      NaN
12     89.0
13     95.0
14     83.0
15     90.0
16    100.0
17     89.0
dtype: float64


sample_median = sample_1dim.median()

sample_median

86.0


sample.fillna(sample_median)


df = pd.DataFrame({'label':['A', 'B', 'B', 'C', 'C', 'C', 'D']})
df


df.describe()


df.label.value_counts()

C    3
B    2
A    1
D    1
Name: label, dtype: int64


from collections import Counter


colors = ['red', 'blue', 'pink', 'blue', 'blue', 'red']

counter = Counter(colors)
counter

Counter({'red': 2, 'blue': 3, 'pink': 1})


counter.most_common()

[('blue', 3), ('red', 2), ('pink', 1)]


Counter(df.label).most_common()

[('C', 3), ('B', 2), ('A', 1), ('D', 1)]


from sklearn.preprocessing import scale, minmax_scale


arr_x = (np.arange(9)-3).reshape(-1, 1)

arr_x

array([[-3],
       [-2],
       [-1],
       [ 0],
       [ 1],
       [ 2],
       [ 3],
       [ 4],
       [ 5]])


type(arr_x)

numpy.ndarray


scale(arr_x)

array([[-1.54919334],
       [-1.161895  ],
       [-0.77459667],
       [-0.38729833],
       [ 0.        ],
       [ 0.38729833],
       [ 0.77459667],
       [ 1.161895  ],
       [ 1.54919334]])


minmax_scale(arr_x)

array([[0.   ],
       [0.125],
       [0.25 ],
       [0.375],
       [0.5  ],
       [0.625],
       [0.75 ],
       [0.875],
       [1.   ]])


my_df = pd.DataFrame(np.hstack([arr_x, scale(arr_x), minmax_scale(arr_x)]),
                     columns=['arr_x', 'scale', 'minmax_scale'])

my_df


my_df.std()

arr_x           2.738613
scale           1.060660
minmax_scale    0.342327
dtype: float64

Try to 개발자 EthanJ의 성장 로그

Try to 개발자 EthanJ의 성장 로그

Pandas Data pre-processing 판다스 데이터 전처리 본문

Pandas Data pre-processing 판다스 데이터 전처리

Pandas Data pre-processing
판다스 데이터 전처리

1. Hierarchical Indexing 계층 색인¶

1.1. Series Hierarchical Indexing 시리즈 계층 색인¶

1.2. `obj.unstack()`¶

1.3. `obj.stack()`¶

1.4. `DataFrame` 계층 색인¶

1.5. `df.swaplevel()`¶

2. Sorting 정렬¶

> 연습문제¶

3. Data merge 데이터 합병¶

3.1. parameter `how='inner'`¶

3.2. `on`, `left_on`, `right_on`¶

4. Data Concatenate 데이터 연결 `pd.concat()`¶

> 연습문제¶

5. Data 집계¶

5.1. `df.groupby(column name)`¶

6. 통계¶

> 연습문제¶

7. 날짜 형식¶

7.1. 날짜 데이터: str 타입¶

7.2. 날짜 데이터: `timestamp` 타입¶

8. label 형식 통일¶

9. 문자 형식(대소문자, 기호 등) 통일¶

10. data value에 대한 처리¶

10.1. 결측치 처리 - 삭제, 선택¶

10.2. 결측치 처리 - 대체값¶

11. 데이터 단위 통일¶

11.1. 표준화(Standardization)¶

'CS & DS > Numpy & Pandas' 카테고리의 다른 글

티스토리툴바

		서울		경기
		강남	잠실	분당	수원	판교
2017	a	10	8	10	7	6
2017	b	9	4	10	9	19
2018	a	13	9	1	12	16
2018	b	11	14	16	14	19

			경기	서울
2017	a	강남	NaN	10.0
		분당	10.0	NaN
		수원	7.0	NaN
		잠실	NaN	8.0
		판교	6.0	NaN
	b	강남	NaN	9.0
		분당	10.0	NaN
		수원	9.0	NaN
		잠실	NaN	4.0
		판교	19.0	NaN
2018	a	강남	NaN	13.0
		분당	1.0	NaN
		수원	12.0	NaN
		잠실	NaN	9.0
		판교	16.0	NaN
	b	강남	NaN	11.0
		분당	16.0	NaN
		수원	14.0	NaN
		잠실	NaN	14.0
		판교	19.0	NaN

		강남	잠실	분당	수원	판교
		서울	서울	경기	경기	경기
2017	a	10	8	10	7	6
2017	b	9	4	10	9	19
2018	a	13	9	1	12	16
2018	b	11	14	16	14	19

	2016		2017
	영어	수학	영어	수학
Kim	52	96	80	58
Park	99	52	73	82
Lee	90	92	95	83
Jung	82	95	78	53
Moon	65	84	81	67

년도	2016		2017
과목	영어	수학	영어	수학
학생명
Kim	52	96	80	58
Park	99	52	73	82
Lee	90	92	95	83
Jung	82	95	78	53
Moon	65	84	81	67

	no	name
0	30	김파썬
1	31	이장고
2	32	박판다
3	33	최넘파
4	34	강주피

	no	name	amount
0	30	김파썬	100.0
1	31	이장고	NaN
2	32	박판다	40.0
3	33	최넘파	130.0
4	34	강주피	NaN
5	40	NaN	40.0
6	41	NaN	60.0

	고객명	날짜	정보
0	김파이썬	2022-10-22	010
1	이장고	2022-10-23	011
2	박팬더스	2022-10-24	019

	고객이름	날짜	구매금액
0	김파이썬	2020-01-01	1
1	박팬더스	2020-02-01	2
2	강주피터	2020-02-15	3

	도시	자치구	연도	남자인구	여자인구	총인구
0	서울	강남구	2013	73	92	165
1	서울	강남구	2014	139	55	194
2	서울	강남구	2015	123	83	206
3	서울	강남구	2016	147	150	297
4	서울	강남구	2017	57	133	190
5	서울	서대문구	2013	95	111	206
6	서울	서대문구	2014	149	150	299
7	서울	서대문구	2015	106	77	183
8	서울	서대문구	2016	56	109	165
9	서울	서대문구	2017	82	96	178
10	서울	종로구	2013	121	68	189
11	서울	종로구	2014	107	55	162
12	서울	종로구	2015	50	79	129
13	서울	종로구	2016	100	80	180
14	서울	종로구	2017	105	91	196
15	서울	영등포구	2013	146	113	259
16	서울	영등포구	2014	127	117	244
17	서울	영등포구	2015	70	72	142
18	서울	영등포구	2016	141	136	277
19	서울	영등포구	2017	145	124	269
20	서울	송파구	2013	90	130	220
21	서울	송파구	2014	121	66	187
22	서울	송파구	2015	62	121	183
23	서울	송파구	2016	80	92	172
24	서울	송파구	2017	62	150	212
25	서울	도봉구	2013	113	138	251
26	서울	도봉구	2014	145	140	285
27	서울	도봉구	2015	56	139	195
28	서울	도봉구	2016	60	71	131
29	서울	도봉구	2017	111	62	173
30	서울	동작구	2013	120	117	237
31	서울	동작구	2014	94	108	202
32	서울	동작구	2015	74	139	213
33	서울	동작구	2016	87	84	171
34	서울	동작구	2017	79	134	213
35	부산	해운대구	2013	124	103	227
36	부산	해운대구	2014	101	144	245
37	부산	해운대구	2015	115	70	185
38	부산	해운대구	2016	134	126	260
39	부산	해운대구	2017	146	72	218
40	부산	수영구	2013	134	94	228
41	부산	수영구	2014	74	138	212
42	부산	수영구	2015	69	81	150
43	부산	수영구	2016	81	148	229
44	부산	수영구	2017	144	98	242
45	부산	동래구	2013	83	65	148
46	부산	동래구	2014	139	87	226
47	부산	동래구	2015	147	115	262
48	부산	동래구	2016	61	102	163
49	부산	동래구	2017	132	105	237

« 2025/10 »
일	월	화	수	목	금	토
			1	2	3	4
5	6	7	8	9	10	11
12	13	14	15	16	17	18
19	20	21	22	23	24	25
26	27	28	29	30	31

	math	english	science
0	70.0	NaN	NaN
1	75.0	65.0	80.0
2	NaN	NaN	NaN
3	56.0	89.0	NaN
4	89.0	95.0	83.0
5	90.0	100.0	89.0

	math	english	science
0	False	True	True
1	False	False	False
2	True	True	True
3	False	False	True
4	False	False	False
5	False	False	False

	math	english	science
0	70.0	54.5	54.5
1	75.0	65.0	80.0
2	54.5	54.5	54.5
3	56.0	89.0	54.5
4	89.0	95.0	83.0
5	90.0	100.0	89.0

	math	english	science
0	70.0	86.0	86.0
1	75.0	65.0	80.0
2	86.0	86.0	86.0
3	56.0	89.0	86.0
4	89.0	95.0	83.0
5	90.0	100.0	89.0

	arr_x	scale	minmax_scale
0	-3.0	-1.549193	0.000
1	-2.0	-1.161895	0.125
2	-1.0	-0.774597	0.250
3	0.0	-0.387298	0.375
4	1.0	0.000000	0.500
5	2.0	0.387298	0.625
6	3.0	0.774597	0.750
7	4.0	1.161895	0.875
8	5.0	1.549193	1.000

Pandas Data visualization with matplotlib 판다스 데이터 시각화 (0)	2022.11.12
Pandas Data analysis with Baseball player 판다스 야구 선수 데이터 분석 (0)	2022.11.12
Pandas Data Loading 판다스 데이터 적재 (0)	2022.11.05
Pandas DataFrame 판다스 데이터프레임 (0)	2022.11.05
Pandas Series 판다스 시리즈 (0)	2022.11.03

Try to 개발자 EthanJ의 성장 로그

Pandas Data pre-processing 판다스 데이터 전처리 본문

Pandas Data pre-processing 판다스 데이터 전처리

Pandas Data pre-processing 판다스 데이터 전처리

1. Hierarchical Indexing 계층 색인¶

1.1. Series Hierarchical Indexing 시리즈 계층 색인¶

1.2. obj.unstack()¶

1.3. obj.stack()¶

1.4. DataFrame 계층 색인¶

1.5. df.swaplevel()¶

2. Sorting 정렬¶

> 연습문제¶

3. Data merge 데이터 합병¶

3.1. parameter how='inner'¶

3.2. on, left_on, right_on¶

4. Data Concatenate 데이터 연결 pd.concat()¶

> 연습문제¶

5. Data 집계¶

5.1. df.groupby(column name)¶

6. 통계¶

> 연습문제¶

7. 날짜 형식¶

7.1. 날짜 데이터: str 타입¶

7.2. 날짜 데이터: timestamp 타입¶

8. label 형식 통일¶

9. 문자 형식(대소문자, 기호 등) 통일¶

10. data value에 대한 처리¶

10.1. 결측치 처리 - 삭제, 선택¶

10.2. 결측치 처리 - 대체값¶

11. 데이터 단위 통일¶

11.1. 표준화(Standardization)¶

'CS & DS > Numpy & Pandas' 카테고리의 다른 글

티스토리툴바

Pandas Data pre-processing
판다스 데이터 전처리

1.2. `obj.unstack()`¶

1.3. `obj.stack()`¶

1.4. `DataFrame` 계층 색인¶

1.5. `df.swaplevel()`¶

3.1. parameter `how='inner'`¶

3.2. `on`, `left_on`, `right_on`¶

4. Data Concatenate 데이터 연결 `pd.concat()`¶

5.1. `df.groupby(column name)`¶

7.2. 날짜 데이터: `timestamp` 타입¶