【資料分析可視化】通過去重進行資料清洗

2022-02-14 14:28:23

import numpy as np
import pandas as pd
from pandas import Series,DataFrame

# Load the CSV produced by the previous parsing/processing step.
# NOTE(review): the path is an absolute, machine-specific location — adjust before running elsewhere.
link_csv = '/Users/bennyrhys/Desktop/資料分析可視化-資料集/homework/demo_duplicate.csv'
df = pd.read_csv(link_csv)
df

Unnamed: 0	Price	Seqno	Symbol	time
0	0	1623.0	0.0	APPL	1473411962
1	1	1623.0	0.0	APPL	1473411962
2	2	1623.0	0.0	APPL	1473411963
3	3	1623.0	0.0	APPL	1473411963
4	4	1649.0	1.0	APPL	1473411963

# Remove the useless 'Unnamed: 0' column; `del` modifies df in place.
del df['Unnamed: 0']
df

Price	Seqno	Symbol	time
0	1623.0	0.0	APPL	1473411962
1	1623.0	0.0	APPL	1473411962
2	1623.0	0.0	APPL	1473411963
3	1623.0	0.0	APPL	1473411963
4	1649.0	1.0	APPL	1473411963

# Total number of cells in the frame (rows x columns).
df.size

# Number of rows in the frame.
len(df)

# List the distinct values of the Seqno column (repeated values collapse to one entry).
df['Seqno'].unique()

array([0., 1.])

# Count how many distinct Seqno values there are.
len(df['Seqno'].unique())

# Flag each row whose Seqno already appeared in an earlier row;
# the first occurrence of a value stays False.
df['Seqno'].duplicated()

0    False
1     True
2     True
3     True
4    False
Name: Seqno, dtype: bool

# Drop the repeated Seqno values, i.e. the entries that duplicated() marks True.
# The result is a new Series; it is not assigned, so df is unchanged.
df['Seqno'].drop_duplicates()

0    0.0
4    1.0
Name: Seqno, dtype: float64

# Limitation of the column-level approach: the result is only a Series holding
# the Seqno column, so the other columns of the rows are not shown.
type(df['Seqno'].drop_duplicates())

pandas.core.series.Series

# With no subset argument, rows are compared across all columns, so only fully
# identical rows are removed. Rows that share a Seqno but differ in another
# column (e.g. time) are kept, so repeated Seqno values are not fully removed.
df.drop_duplicates()

Price	Seqno	Symbol	time
0	1623.0	0.0	APPL	1473411962
2	1623.0	0.0	APPL	1473411963
4	1649.0	1.0	APPL	1473411963

# Deduplicate at the DataFrame level using only Seqno as the key, so whole rows
# are kept. The result is not assigned back, so df itself is left unchanged.
df.drop_duplicates(['Seqno'])

Price	Seqno	Symbol	time
0	1623.0	0.0	APPL	1473411962
4	1649.0	1.0	APPL	1473411963

# Deduplicate on Seqno, keeping the last occurrence of each value.
df.drop_duplicates(['Seqno'],keep='last')

Price	Seqno	Symbol	time
3	1623.0	0.0	APPL	1473411963
4	1649.0	1.0	APPL	1473411963

繼續閱讀