본문 바로가기

Python

[Python] Pandas data 처리

반응형

🖐 누락 데이터 확인

import seaborn as sns

# Load the 'titanic' example dataset through seaborn.
df = sns.load_dataset("titanic")
df.head()  # first rows; the result is neither assigned nor printed here

# Print a summary of the frame (df.info() writes to stdout).
df.info()

# Count each distinct 'deck' value; dropna=False keeps NaN as its own row
# so the number of missing entries shows up in the result.
deck_column = df.loc[:, "deck"]
nan_deck = deck_column.value_counts(dropna=False)
print(nan_deck)

👉 'deck' 열에 NaN 값이 있음을 확인  /  'deck' 열에 있는 누락 데이터가 688개 라는 사실을 알 수 있음

 

🖐 누락 데이터 제거

import seaborn as sns

# Load the 'titanic' example dataset through seaborn.
df = sns.load_dataset('titanic')

# Boolean frame of the same shape as df: True wherever a value is missing.
missing_df = df.isnull()

# Print the number of missing values per column.
# Summing a boolean column counts its True entries, so a column without any
# missing value reports 0 directly -- no try/except around a failed
# value_counts()[True] lookup is needed.
for col in missing_df.columns:
    print(col, ': ', missing_df[col].sum())

# Keep only the columns that contain at least 500 non-missing values.
# NOTE: thresh is the required number of non-NaN entries, not a NaN count;
# columns with fewer than 500 valid values are dropped.
df_thresh = df.dropna(thresh=500, axis='columns')
print(df_thresh.columns)

# Drop every row whose 'age' value is missing.
df_age = df.dropna(axis='index', how='any', subset=['age'])
print(len(df_age))

🖐 누락 데이터 치환

# Replace missing 'age' values with the column mean.
print(df['age'].head(10))
print('\n')

mean_age = df['age'].mean(axis=0)
# Assign the filled column back instead of calling fillna(inplace=True) on
# df['age']: the in-place call acts on an intermediate Series, which emits a
# chained-assignment warning in pandas 2.x and leaves df unchanged once
# Copy-on-Write is active.
df['age'] = df['age'].fillna(mean_age)
print(df['age'].head(10))

# Replace missing 'embark_town' values with the most frequent value.
print(df['embark_town'][825:830])
# value_counts(dropna=True) ignores NaN; idxmax() returns the label with the
# highest count.
most_freq = df['embark_town'].value_counts(dropna=True).idxmax()
print(most_freq)
# Assign the result back rather than using fillna(inplace=True) on the
# selected column, so the update reliably lands in df (the chained in-place
# form warns in pandas 2.x and is a no-op under Copy-on-Write).
df['embark_town'] = df['embark_town'].fillna(most_freq)
print(df['embark_town'][825:830])

# Replace missing values with the neighbouring (previous) value.
import seaborn as sns

# Reload the dataset so 'embark_town' contains its original NaN entries.
df = sns.load_dataset('titanic')

print(df['embark_town'][825:830])

# Forward-fill: each NaN takes the last valid value that precedes it.
# Series.ffill() replaces the deprecated fillna(method='ffill'), and the
# result is assigned back instead of relying on a chained inplace=True call,
# which does not update df under Copy-on-Write.
df['embark_town'] = df['embark_town'].ffill()
print(df['embark_town'][825:830])

mean / most_freq / ffill

🖐 중복 데이터 처리

import pandas as pd

# Small demo frame; rows 0 and 1 are identical.
df = pd.DataFrame(
    {
        "c1": list("aabab"),
        "c2": [1, 1, 1, 2, 2],
        "c3": [1, 1, 2, 2, 2],
    }
)
print(df)

# Detect duplicates: True marks a row that repeats an earlier row.
df_dup = df.duplicated(keep="first")
print(df_dup)

# Same check restricted to the values of column 'c2'.
col_dup = df.loc[:, "c2"].duplicated()
print(col_dup)

# Remove duplicates: keep only rows not flagged as a repeat.
df2 = df[~df.duplicated()]
print(df2)

🖐 자료형 변환

import pandas as pd

# Read the auto-mpg CSV and replace its column labels with a fixed list.
df = pd.read_csv("/kaggle/input/autompg-dataset/auto-mpg.csv")
column_names = [
    "mpg",
    "cylinders",
    "displacement",
    "horsepower",
    "weight",
    "acceleration",
    "model year",
    "origin",
    "name",
]
df.columns = column_names

# Inspect the dtype of every column, then the distinct 'horsepower' values.
print(df.dtypes)
print(df["horsepower"].unique())

2번째 줄에 문자열 '?' 를 NaN값으로 변환 필요

# Remove missing data from 'horsepower' and convert the column to float.
import numpy as np

# Turn the '?' placeholder into np.nan. The result is assigned back instead of
# using replace(inplace=True) on df['horsepower'], because the chained
# in-place form does not modify df under Copy-on-Write.
df['horsepower'] = df['horsepower'].replace('?', np.nan)
df.dropna(subset=['horsepower'], axis=0, inplace=True)   # drop rows with missing horsepower
df['horsepower'] = df['horsepower'].astype('float')      # convert the string column to float

print(df['horsepower'].dtypes)

# Output: float64

# Convert the integer codes in 'origin' to string labels.
print(df['origin'].unique())
# Assign the mapped column back rather than calling replace(inplace=True) on
# the selected column, so the change reliably lands in df (the chained
# in-place form is a no-op under Copy-on-Write).
df['origin'] = df['origin'].replace({1: 'USA', 2: 'EU', 3: 'JPN'})

print(df['origin'].unique())
print(df['origin'].dtypes)

# Output:
# [1 3 2]
# ['USA' 'JPN' 'EU']
# object

πŸ– 필터링

import seaborn as sns

# Load the 'titanic' example dataset through seaborn.
titanic = sns.load_dataset("titanic")

# Boolean mask: age from 10 (inclusive) up to 20 (exclusive).
ages = titanic["age"]
mask1 = ages.ge(10) & ages.lt(20)

# Keep only the rows where the mask is True, with all columns.
df_teenage = titanic.loc[mask1]
print(df_teenage)

필터링 된 데이터를 통해 모든 승객이 10대 임을 확인

import seaborn as sns

# Load the 'titanic' example dataset through seaborn.
titanic = sns.load_dataset("titanic")

# Combine two conditions: age below 10 AND sex equal to 'female'.
is_under_10 = titanic["age"].lt(10)
is_female = titanic["sex"].eq("female")
mask2 = is_under_10 & is_female

# Select the matching rows and show the first few.
df_female_under10 = titanic.loc[mask2]
print(df_female_under10.head())

import seaborn as sns
import pandas as pd

# Load the 'titanic' example dataset through seaborn.
titanic = sns.load_dataset("titanic")

# Set the maximum number of columns pandas prints.
pd.set_option("display.max_columns", 10)

# One mask per 'sibsp' value of interest (siblings or spouses aboard: 3, 4, 5).
sibsp = titanic["sibsp"]
mask3 = sibsp.eq(3)
mask4 = sibsp.eq(4)
mask5 = sibsp.eq(5)

# A row is kept when any of the three masks is True.
df_boolean = titanic.loc[mask3 | mask4 | mask5]
print(df_boolean.head())

🖐 데이터프레임 연결

import pandas as pd

# Two demo frames with overlapping index labels (2, 3); df2 has an extra
# column 'd'. Each cell follows the pattern <column letter><index label>.
df1 = pd.DataFrame({'a': ['a0', 'a1', 'a2', 'a3'],
                    'b': ['b0', 'b1', 'b2', 'b3'],
                    'c': ['c0', 'c1', 'c2', 'c3']},
                   index=[0, 1, 2, 3])

# The last value of column 'c' is 'c5' (the original 'b5' was a typo that
# broke the <column letter><index label> pattern).
df2 = pd.DataFrame({'a': ['a2', 'a3', 'a4', 'a5'],
                    'b': ['b2', 'b3', 'b4', 'b5'],
                    'c': ['c2', 'c3', 'c4', 'c5'],
                    'd': ['d2', 'd3', 'd4', 'd5']},
                   index=[2, 3, 4, 5])

print(df1, '\n')
print(df2, '\n')

# Stack the frames vertically; the original index labels are kept and the
# rows coming from df1 get NaN in column 'd'.
result1 = pd.concat([df1, df2])
print(result1, '\n')

# Same concatenation, but the row index is renumbered from 0.
result2 = pd.concat([df1, df2], ignore_index=True)
print(result2, '\n')

👉 df1, df2 연결하면 df1의 0,1,2,3 행에는 'd' 열이 없기 때문에 NaN 으로 입력

👉 ignore_index 옵션 추가 시, 기존 행 인덱스를 무시하고 새로운 행 인덱스를 설정

# Join the two DataFrames side by side, along the column axis.
result3 = pd.concat([df1, df2], axis="columns")
print(result3, "\n")

# Apply join='inner' (intersection of the row indexes).
result3_in = pd.concat([df1, df2], join="inner", axis="columns")
print(result3_in, "\n")

 

반응형