天天看點

Python 資料分析包 pandas 的使用方法

pandas 是基於 NumPy 的一種工具,該工具是為了解決資料分析任務而建立的。pandas 納入了大量庫和一些標準的資料模型,提供了高效地操作大型資料集所需的工具。pandas 提供了大量能使我們快速便捷地處理資料的函數和方法。——百度百科

基本操作

>>>import pandas as pd
>>>test = pd.Series(['pig', 'girl', 35, -123123123])
>>>test
0           pig
1          girl
2            35
3    -123123123
dtype: object           
>>>test = pd.Series(['pig', 'girl', 35, -123123123],
                 index=['name','name','age','nums'])
>>>test
name           pig
name          girl
age             35
nums    -123123123
dtype: object           
>>>test = pd.Series(['pig', 'girl', 35, -123123123],
                 index=['name','name','age','nums'])
>>>test['name']
name     pig
name    girl
dtype: object
>>>test[['name','age']]
name     pig
name    girl
age       35
dtype: object
           
>>>data = {'year': [2016, 2015, 2017, 2014],
        'teams': ['Bears', 'Bears', 'Bears', 'Packers'],
        'wins': [11, 8, 10, 15],
        'losses': [5, 8, 6, 1]}
>>>football = pd.DataFrame(data)
>>>football
   losses    teams  wins  year
0       5    Bears    11  2016
1       8    Bears     8  2015
2       6    Bears    10  2017
3       1  Packers    15  2014
>>>football.dtypes
losses     int64
teams     object
wins       int64
year       int64
dtype: object
>>>football.describe()
        losses      wins         year
count  4.00000   4.00000     4.000000
mean   5.00000  11.00000  2015.500000
std    2.94392   2.94392     1.290994
min    1.00000   8.00000  2014.000000
25%    4.00000   9.50000  2014.750000
50%    5.50000  10.50000  2015.500000
75%    6.50000  12.00000  2016.250000
max    8.00000  15.00000  2017.000000
>>>football.head()
   losses    teams  wins  year
0       5    Bears    11  2016
1       8    Bears     8  2015
2       6    Bears    10  2017
3       1  Packers    15  2014
>>>football.tail()
   losses    teams  wins  year
0       5    Bears    11  2016
1       8    Bears     8  2015
2       6    Bears    10  2017
3       1  Packers    15  2014
>>>import numpy
>>> # 注意:olympic_medal_counts_df 在下一個範例中才建立,需先執行該範例再執行此段
>>> avg_medal_count = olympic_medal_counts_df[['gold', 'silver', 'bronze']].apply(numpy.mean)
>>> avg_medal_count
gold      3.807692
silver    3.730769
bronze    3.807692
dtype: float64

           
>>> from pandas import DataFrame, Series 
>>>countries = ['Russian Fed.', 'Norway', 'Canada', 'United States',
             'Netherlands', 'Germany', 'Switzerland', 'Belarus',
             'Austria', 'France', 'Poland', 'China', 'Korea',
             'Sweden', 'Czech Republic', 'Slovenia', 'Japan',
             'Finland', 'Great Britain', 'Ukraine', 'Slovakia',
             'Italy', 'Latvia', 'Australia', 'Croatia', 'Kazakhstan']
>>>gold = [13, 11, 10, 9, 8, 8, 6, 5, 4, 4, 4, 3, 3, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
>>>silver = [11, 5, 10, 7, 7, 6, 3, 0, 8, 4, 1, 4, 3, 7, 4, 2, 4, 3, 1, 0, 0, 2, 2, 2, 1, 0]
>>>bronze = [9, 10, 5, 12, 9, 5, 2, 1, 5, 7, 1, 2, 2, 6, 2, 4, 3, 1, 2, 1, 0, 6, 2, 1, 0, 1]
>>>olympic_medal_counts = {'country_name': Series(countries), 'gold': Series(gold),
                        'silver': Series(silver), 'bronze': Series(bronze)}
>>>olympic_medal_counts_df = DataFrame(olympic_medal_counts)
>>>olympic_medal_counts_df
    bronze    country_name  gold  silver
0        9    Russian Fed.    13      11
1       10          Norway    11       5
2        5          Canada    10      10
3       12   United States     9       7
4        9     Netherlands     8       7
5        5         Germany     8       6
6        2     Switzerland     6       3
7        1         Belarus     5       0
8        5         Austria     4       8
9        7          France     4       4
10       1          Poland     4       1
11       2           China     3       4
12       2           Korea     3       3
13       6          Sweden     2       7
14       2  Czech Republic     2       4
15       4        Slovenia     2       2
16       3           Japan     1       4
17       1         Finland     1       3
18       2   Great Britain     1       1
19       1         Ukraine     1       0
20       0        Slovakia     1       0
21       6           Italy     0       2
22       2          Latvia     0       2
23       1       Australia     0       2
24       0         Croatia     0       1
25       1      Kazakhstan     0       0
           
>>>import pandas as pd
>>>data = {'year': [2010, 2011, 2012, 2011, 2012, 2010, 2011, 2012],
        'team': ['Bears', 'Bears', 'Bears', 'Packers', 'Packers', 'Lions',
                 'Lions', 'Lions'],
        'wins': [11, 8, 10, 15, 11, 6, 10, 4],
        'losses': [5, 8, 6, 1, 5, 10, 6, 12]}
>>>football = pd.DataFrame(data)
>>>football['year']
0    2010
1    2011
2    2012
3    2011
4    2012
5    2010
6    2011
7    2012
Name: year, dtype: int64
>>>football.year
0    2010
1    2011
2    2012
3    2011
4    2012
5    2010
6    2011
7    2012
Name: year, dtype: int64
>>>football[['year', 'wins', 'losses']]
   year  wins  losses
0  2010    11       5
1  2011     8       8
2  2012    10       6
3  2011    15       1
4  2012    11       5
5  2010     6      10
6  2011    10       6
7  2012     4      12
           
>>>data = {'year': [2010, 2011, 2012, 2011, 2012, 2010, 2011, 2012],
            'team': ['Bears', 'Bears', 'Bears', 'Packers', 'Packers', 'Lions',
                     'Lions', 'Lions'],
            'wins': [11, 8, 10, 15, 11, 6, 10, 4],
            'losses': [5, 8, 6, 1, 5, 10, 6, 12]}
>>>football = pd.DataFrame(data)
>>>football.iloc[[0]]
   losses   team  wins  year
0       5  Bears    11  2010
>>>football.loc[[0]]
   losses   team  wins  year
0       5  Bears    11  2010
>>>football[3:5]
   losses     team  wins  year
3       1  Packers    15  2011
4       5  Packers    11  2012
>>>football[football.wins > 10]
   losses     team  wins  year
0       5    Bears    11  2010
3       1  Packers    15  2011
4       5  Packers    11  2012
>>>football[(football.wins > 10) & (football.team == "Packers")]
   losses     team  wins  year
3       1  Packers    15  2011
4       5  Packers    11  2012