import numpy as np
import pandas as pd
from pandas import Series,DataFrame
# Load the data produced by the earlier splitting/processing step.
# NOTE(review): the path is an absolute location on one machine — adjust it before running elsewhere.
link_csv = '/Users/bennyrhys/Desktop/資料分析可視化-資料集/homework/demo_duplicate.csv'
df = pd.read_csv(link_csv)
# Display the loaded DataFrame.
df
Unnamed: 0 | Price | Seqno | Symbol | time |
0 | 0 | 1623.0 | 0.0 | APPL | 1473411962 |
1 | 1 | 1623.0 | 0.0 | APPL | 1473411962 |
2 | 2 | 1623.0 | 0.0 | APPL | 1473411963 |
3 | 3 | 1623.0 | 0.0 | APPL | 1473411963 |
4 | 4 | 1649.0 | 1.0 | APPL | 1473411963 |
# Remove the unneeded 'Unnamed: 0' column (deleted in place on df).
# Presumably this column is an index saved by an earlier export — verify against how the CSV was written.
del df['Unnamed: 0']
# Display the DataFrame again to confirm the column is gone.
df
Price | Seqno | Symbol | time |
0 | 1623.0 | 0.0 | APPL | 1473411962 |
1 | 1623.0 | 0.0 | APPL | 1473411962 |
2 | 1623.0 | 0.0 | APPL | 1473411963 |
3 | 1623.0 | 0.0 | APPL | 1473411963 |
4 | 1649.0 | 1.0 | APPL | 1473411963 |
# Total number of elements in the DataFrame (rows x columns).
df.size
20
# Number of rows in the DataFrame.
len(df)
5
# Inspect the distinct values in the Seqno column to see how much duplication there is.
df['Seqno'].unique()
array([0., 1.])
# Count the distinct Seqno values; dropna=False counts NaN (if present) as one
# value, matching the length of the array returned by unique().
df['Seqno'].nunique(dropna=False)
2
# Flag every row whose Seqno value already appeared in an earlier row;
# the first occurrence of each value is marked False.
df['Seqno'].duplicated(keep='first')
0 False
1 True
2 True
3 True
4 False
Name: Seqno, dtype: bool
# Drop the repeated Seqno values, i.e. the entries that duplicated() flags as True.
df['Seqno'].drop_duplicates()
0 0.0
4 1.0
Name: Seqno, dtype: float64
# This approach is limited: the result is only a Series of Seqno values,
# so the other columns of the DataFrame are not shown.
type(df['Seqno'].drop_duplicates())
pandas.core.series.Series
# Called with no arguments, drop_duplicates() compares all columns, so a row is
# removed only when it matches an earlier row in every column; rows that merely
# repeat a Seqno value are kept.
df.drop_duplicates()
Price | Seqno | Symbol | time |
0 | 1623.0 | 0.0 | APPL | 1473411962 |
2 | 1623.0 | 0.0 | APPL | 1473411963 |
4 | 1649.0 | 1.0 | APPL | 1473411963 |
# Deduplicate at the DataFrame level, comparing only the Seqno column, so that
# all columns of the surviving rows are kept (first occurrence wins by default).
df.drop_duplicates(subset=['Seqno'])
Price | Seqno | Symbol | time |
0 | 1623.0 | 0.0 | APPL | 1473411962 |
4 | 1649.0 | 1.0 | APPL | 1473411963 |
# Deduplicate on Seqno, but keep the last occurrence of each value instead of
# the first.
df.drop_duplicates(subset=['Seqno'], keep='last')
Price | Seqno | Symbol | time |
3 | 1623.0 | 0.0 | APPL | 1473411963 |
4 | 1649.0 | 1.0 | APPL | 1473411963 |