반응형
📌 누락 데이터 확인
# Load the 'titanic' example dataset through seaborn.
import seaborn as sns

df = sns.load_dataset('titanic')

# Preview the first rows, then print the per-column summary.
df.head()
df.info()

# Tally every distinct value of 'deck'; dropna=False keeps NaN as its own
# entry so the number of missing values shows up in the counts.
deck_column = df['deck']
nan_deck = deck_column.value_counts(dropna=False)
print(nan_deck)
📌 'deck' 열에 NaN 값이 있음을 확인 / 'deck' 열에 있는 누락 데이터가 688개라는 사실을 알 수 있음
📌 누락 데이터 제거
import seaborn as sns

df = sns.load_dataset('titanic')

# Boolean frame: True wherever the original value is missing.
missing_df = df.isnull()

# Print the number of missing values per column.  Summing a boolean column
# counts its True entries and yields 0 when there are none, so no exception
# handling is needed for columns without missing data.
for col in missing_df.columns:
    missing_count = missing_df[col].sum()
    print(col, ': ', missing_count)

# thresh=500 keeps a column only if it holds at least 500 non-NaN values;
# columns with fewer valid values are dropped.
df_thresh = df.dropna(axis=1, thresh=500)
print(df_thresh.columns)

# Drop every row whose 'age' value is missing.
df_age = df.dropna(subset=['age'], how='any', axis=0)
print(len(df_age))
📌 누락 데이터 치환
# Replace missing ages with the column mean.
print(df['age'].head(10))
print('\n')
mean_age = df['age'].mean(axis=0)
# Assign the filled column back to the frame.  Calling
# fillna(..., inplace=True) on df['age'] is a chained in-place update:
# recent pandas versions warn about it, and with Copy-on-Write enabled it
# leaves df unchanged.
df['age'] = df['age'].fillna(mean_age)
print(df['age'].head(10))

# Replace missing values with the most frequent value.
print(df['embark_town'][825:830])
most_freq = df['embark_town'].value_counts(dropna=True).idxmax()
print(most_freq)
df['embark_town'] = df['embark_town'].fillna(most_freq)
print(df['embark_town'][825:830])
# Replace missing values with a neighboring value.
import seaborn as sns

df = sns.load_dataset('titanic')
print(df['embark_town'][825:830])
# Forward fill: each NaN takes the last valid value that precedes it.
# Series.ffill() replaces the deprecated fillna(method='ffill'), and the
# result is assigned back rather than modified in place through df[...],
# which would be a chained update that does not reliably reach df.
df['embark_town'] = df['embark_town'].ffill()
print(df['embark_town'][825:830])
📌 중복 데이터 처리
import pandas as pd

# Small demo frame; rows 0 and 1 hold identical values.
df = pd.DataFrame(
    dict(
        c1=list('aabab'),
        c2=[1, 1, 1, 2, 2],
        c3=[1, 1, 2, 2, 2],
    )
)
print(df)

# Detect duplicates: a row (or value) is flagged True when an identical one
# has already appeared earlier.
df_dup = df.duplicated(keep='first')
print(df_dup)
col_dup = df.loc[:, 'c2'].duplicated(keep='first')
print(col_dup)

# Remove duplicate rows, keeping the first occurrence of each.
df2 = df.drop_duplicates(keep='first')
print(df2)
📌 자료형 변환
import numpy as np
import pandas as pd

df = pd.read_csv('/kaggle/input/autompg-dataset/auto-mpg.csv')
df.columns = ['mpg','cylinders','displacement','horsepower','weight','acceleration','model year','origin','name']
print(df.dtypes)
print(df['horsepower'].unique())

# Remove missing data.
# The replaced column is assigned back to df: replace(..., inplace=True) on
# df['horsepower'] is a chained in-place update, which recent pandas versions
# warn about and which does not modify df under Copy-on-Write.
df['horsepower'] = df['horsepower'].replace('?', np.nan)  # turn '?' into np.nan
df.dropna(subset=['horsepower'], axis=0, inplace=True)  # drop rows with missing horsepower
df['horsepower'] = df['horsepower'].astype('float')  # convert strings to floats
print(df['horsepower'].dtypes)
# float64

# Convert the integer codes into string labels.
print(df['origin'].unique())
df['origin'] = df['origin'].replace({1: 'USA', 2: 'EU', 3: 'JPN'})
print(df['origin'].unique())
print(df['origin'].dtypes)
# [1 3 2]
# ['USA' 'JPN' 'EU']
# object
📌 필터링
import seaborn as sns

titanic = sns.load_dataset('titanic')

# Select passengers whose age is at least 10 and below 20.
passenger_age = titanic['age']
mask1 = passenger_age.ge(10) & passenger_age.lt(20)
df_teenage = titanic[mask1]
print(df_teenage)
import seaborn as sns

titanic = sns.load_dataset('titanic')

# Select female passengers younger than 10; both conditions must hold.
is_under_10 = titanic['age'] < 10
is_female = titanic['sex'] == 'female'
mask2 = is_under_10 & is_female
df_female_under10 = titanic[mask2]
print(df_female_under10.head())
import pandas as pd
import seaborn as sns

titanic = sns.load_dataset('titanic')
pd.set_option('display.max_columns', 10)  # maximum number of columns to print

# One mask per value of 'sibsp' (siblings or spouses aboard): 3, 4 and 5.
sibsp = titanic['sibsp']
mask3 = sibsp.eq(3)
mask4 = sibsp.eq(4)
mask5 = sibsp.eq(5)

# A row is kept when any of the three masks is True.
df_boolean = titanic.loc[mask3 | mask4 | mask5]
print(df_boolean.head())
📌 데이터프레임 연결
import pandas as pd

# Two demo frames with overlapping row labels (2 and 3); df2 has an extra
# column 'd' that df1 lacks.
df1 = pd.DataFrame({'a': ['a0', 'a1', 'a2', 'a3'],
                    'b': ['b0', 'b1', 'b2', 'b3'],
                    'c': ['c0', 'c1', 'c2', 'c3']},
                   index=[0, 1, 2, 3])
# The last 'c' value was 'b5' in the original, breaking the column's naming
# pattern; it is corrected to 'c5'.
df2 = pd.DataFrame({'a': ['a2', 'a3', 'a4', 'a5'],
                    'b': ['b2', 'b3', 'b4', 'b5'],
                    'c': ['c2', 'c3', 'c4', 'c5'],
                    'd': ['d2', 'd3', 'd4', 'd5']},
                   index=[2, 3, 4, 5])
print(df1, '\n')
print(df2, '\n')

# Stack the frames vertically; the original row labels are kept, and rows
# coming from df1 get NaN in column 'd'.
result1 = pd.concat([df1, df2])
print(result1, '\n')

# Same concatenation, but discard the original row labels and renumber the
# rows from 0.
result2 = pd.concat([df1, df2], ignore_index=True)
print(result2, '\n')
📌 df1, df2 연결하면 df1의 0,1,2,3 행에는 'd' 열이 없기 때문에 NaN 으로 입력
📌 ignore_index 옵션 추가 시, 기존 행 인덱스를 무시하고 새로운 행 인덱스를 설정
# Join the two dataframes side by side, along the column axis.
result3 = pd.concat([df1, df2], axis=1)
print(result3, '\n')

# Apply join='inner' (intersection): only row labels present in both frames
# are kept.
result3_in = pd.concat([df1, df2], axis=1, join='inner')
print(result3_in, '\n')
반응형
'Python' 카테고리의 다른 글
[Python] Pandas _ Time Series (0) | 2023.08.24 |
---|---|
[Python] Pandas _ Apply, Map (0) | 2023.08.24 |
[Python] Pandas data: auto-mpg data 시각화 (0) | 2023.04.30 |
[Python] matplotlib, seaborn 막대그래프 그리기 / 꾸미기 (0) | 2023.04.17 |
[Python] docstring (문서화 / 사용자 정의 함수) (0) | 2023.04.13 |