반응형
## Google Colab 을 이용해서 Python 실행
# Connect Google Drive to this Google Colab session
from google.colab import drive
drive.mount('/content/drive')
## Pandas 파일 불러오기
import pandas as pd
print('pandas version:', pd.__version__)
# Output shown in the original post: pandas version: 1.5.3
* boston.csv 파일 불러오기
# Folder on the mounted Drive that holds the data files
DATA_PATH = '/content/drive/MyDrive/Colab Notebooks/2023/data/'
df_boston = pd.read_csv(DATA_PATH + 'boston.csv')
df_boston.head(1)  # preview the first row
df_boston.info() # author's note: no missing values
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
dtypes: float64(14)
## Pandas 행 / 열 추출하기
# Select the CRIM, ZN and INDUS columns
cols = ['CRIM', 'ZN', 'INDUS']
result = df_boston.loc[:,cols] # first way: use loc
result = df_boston[cols] # second way: plain [] indexing (overwrites the result above)
result
# Row filter: keep only rows where ZN == 18.0, then select CRIM, ZN, INDUS.
# Step-by-step approach.
# Index df_boston with the boolean mask. The mask on its own is a Series of
# True/False values with no CRIM/ZN/INDUS labels, so selecting those columns
# from it would raise KeyError.
result = df_boston[df_boston['ZN'] == 18.0]
cols = ['CRIM', 'ZN', 'INDUS']
result = result[cols]
result
# One-step approach: row mask and column list in a single loc call
df_boston.loc[df_boston['ZN'] == 18.0, ['CRIM', 'ZN', 'INDUS']]
CRIM ZN INDUS
0 0.00632 18.0 2.31
# A misspelled column name ('CRIm' instead of 'CRIM') raises KeyError.
# This line is a deliberate demonstration of that error.
df_boston.loc[df_boston['ZN'] == 18.0, ['CRIm', 'ZN', 'INDUS']]
KeyError: "['CRIm'] not in index"
## 다중 조건식
# Columns to display for the filtered rows
cols = ['CRIM', 'ZN', 'target']
# Keep rows where CRIM is below 1 AND target is at least 24
crim_below_one = df_boston['CRIM'] < 1
target_at_least_24 = df_boston['target'] >= 24
result = df_boston.loc[crim_below_one & target_at_least_24, cols]
result
138 rows × 3 columns
# Keep rows where CRIM is below 1 OR target is at least 24
crim_below_one = df_boston['CRIM'] < 1
target_at_least_24 = df_boston['target'] >= 24
result = df_boston.loc[crim_below_one | target_at_least_24, cols]
result
351 rows × 3 columns
-> and(&) / or(|) 연산자에 따라 row 수가 달라짐을 확인!
## 컬럼 이름값 변경
# Rename a column with rename(), passing a dict of {old name: new name}
# Here: ZN --> landZone
result = df_boston.rename(columns={'ZN' : 'landZone'})
result.head(1)
# Rename several columns in one call: AGE --> A, TAX --> T, PTRATIO --> PR
tempDict = dict(AGE='A', TAX='T', PTRATIO='PR')
result = df_boston.rename(columns=tempDict)
result.head(1)
CRIM ZN INDUS CHAS NOX RM A DIS RAD T PR B LSTAT target
0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.09 1.0 296.0 15.3 396.9 4.98 24.0
## Pandas Data 세기
# value_counts(): number of rows for each distinct RAD value
df_boston['RAD'].value_counts()
24.0 132
5.0 115
4.0 110
3.0 38
6.0 26
2.0 24
8.0 24
1.0 20
7.0 17
Name: RAD, dtype: int64
# value_counts(normalize = True): share of rows per value instead of counts
df_boston['RAD'].value_counts(normalize = True) # returned as proportions
24.0 0.260870
5.0 0.227273
4.0 0.217391
3.0 0.075099
6.0 0.051383
2.0 0.047431
8.0 0.047431
1.0 0.039526
7.0 0.033597
Name: RAD, dtype: float64
# Round the RAD proportions to two decimals with numpy's np.round()
import numpy as np
rad_share = df_boston['RAD'].value_counts(normalize = True)
result = np.round(rad_share, 2)
result
24.0 0.26
5.0 0.23
4.0 0.22
3.0 0.08
6.0 0.05
2.0 0.05
8.0 0.05
1.0 0.04
7.0 0.03
Name: RAD, dtype: float64
## 시계열 데이터 (pandas.Timestamp)
# Working with time-series data: parse one date-time string.
date_string = "2023-04-13 15:52:01"
datetime_obj = pd.to_datetime(date_string)
# Show the parsed value first, then its class, one per line.
print(datetime_obj, type(datetime_obj), sep='\n')
2023-04-13 15:52:01
<class 'pandas._libs.tslibs.timestamps.Timestamp'>
# timestamp class
https://pandas.pydata.org/docs/reference/api/pandas.Timestamp.html
# Several date strings at once: a list is parsed into a DatetimeIndex.
date_list = ['2023-04-12', '2023-04-13', '2023-04-14']
datetime_obj_list = pd.to_datetime(date_list)
# Print the parsed index, then its class.
for shown in (datetime_obj_list, type(datetime_obj_list)):
    print(shown)
DatetimeIndex(['2023-04-12', '2023-04-13', '2023-04-14'], dtype='datetime64[ns]', freq=None)
<class 'pandas.core.indexes.datetimes.DatetimeIndex'>
# DatetimeIndex class
https://pandas.pydata.org/docs/reference/api/pandas.DatetimeIndex.html
## object - datetime class 변환 : For using method in datetime class
# Convert the 'datesold' column (described by the author as object-typed)
# to datetime so the .dt accessor can be used.
# NOTE(review): `sales` is not created anywhere in the visible part of this
# post. Presumably it is loaded earlier with pd.read_csv; confirm.
sales['datesold'] = pd.to_datetime(sales['datesold'])
# The column is already datetime here, so read the parts straight from .dt
# instead of re-parsing it with pd.to_datetime for every new column.
sales['year'] = sales['datesold'].dt.year
sales['month'] = sales['datesold'].dt.month
sales['day'] = sales['datesold'].dt.day
sales.head(1)
datesold postcode price propertyType bedrooms year month day
0 2007-02-07 2607 525000 house 4 2007 2 7
sales.tail(1)  # show the last row
datesold postcode price propertyType bedrooms year month day
29579 2019-07-26 2612 775000 unit 2 2019 7 26
# Span between the latest and the earliest 'datesold' value
sales['datesold'].max() - sales['datesold'].min()
Timedelta('4553 days 00:00:00')
## datetime method - shift()
# shift(n) moves the values down by n rows.
# Ex. compare a stock price against the previous day / previous week.
# fill_value = 0 fills the vacated leading rows with 0 instead of a missing
# value; .astype(int) sets the column type to int.
temp_df = sales[['datesold', 'price']].copy()
temp_df['shifted_v1'] = temp_df['price'].shift(1, fill_value = 0).astype(int)
temp_df['shifted_v2'] = temp_df['price'].shift(2, fill_value = 0).astype(int)
# Difference from the previous row. The Korean column label ("difference
# value") is restored from its mis-decoded form.
temp_df['차이값'] = temp_df['price'] - temp_df['shifted_v1']
temp_df.head()
datesold price shifted_v1 shifted_v2 차이값
0 2007-02-07 525000 0 0 525000
1 2007-02-27 290000 525000 0 -235000
2 2007-03-07 328000 290000 525000 38000
3 2007-03-09 380000 328000 290000 52000
4 2007-03-21 310000 380000 328000 -70000
# ํ์ํ์ ๋ง๋ค๊ธฐ - 0000๋
00์ 00์ผ
temp_df['ํ๊ธ๋ ์ง'] = temp_df['datesold'].dt.strftime('%Y๋
%m์ %d์ผ')
temp_df.head()
datesold price shifted_v1 shifted_v2 차이값 한글날짜
0 2007-02-07 525000 0 0 525000 2007년 02월 07일
1 2007-02-27 290000 525000 0 -235000 2007년 02월 27일
2 2007-03-07 328000 290000 525000 38000 2007년 03월 07일
3 2007-03-09 380000 328000 290000 52000 2007년 03월 09일
4 2007-03-21 310000 380000 328000 -70000 2007년 03월 21일
반응형
'Python' 카테고리의 다른 글
[Python] matplotlib, seaborn 막대그래프 그리기 / 꾸미기 (0) | 2023.04.17 |
---|---|
[Python] docstring (문서화 / 사용자 정의 함수 ) (0) | 2023.04.13 |
[Python] 기초문법 _ Sequence Type (List, Tuple, Dictionary) (3) | 2023.04.12 |
[Python] 기초문법 _ 문자열 (String) (0) | 2023.04.12 |
[Python] Comprehension (List, Dictionary, Tuple) (0) | 2023.04.12 |