
Python for Data Analysis, Chapter 6: Data Loading, Storage, and File Formats

       This post works through the code in Chapter 6 (Data Loading, Storage, and File Formats) of Python for Data Analysis by Wes McKinney. The book uses Python 2.7, while I ran everything under Python 3.7 in PyCharm, so the book's source code has been modified where needed. Every printed result (apart from anything involving randomness) matches the book (outputs appear in the comments; trivial outputs are omitted). All code was typed by hand and is offered for reference.

       For the PDF and datasets, see: Python for Data Analysis, Chapter 2: the analysis code from the Introduction (includes download links for the PDF and datasets).

    Data loading, storage, and file formats:

  • 1. Data Loading
    • 1.1 Reading and Writing Data in Text Format
    • 1.2 Reading Text Files in Pieces
    • 1.3 Writing Data Out to Text Format
    • 1.4 Working with Delimited Formats Manually
    • 1.5 JSON Data
    • 1.6 XML and HTML: Web Scraping
  • 2. Binary Data Formats
  • 3. Using the HDF5 Format
  • 4. Reading Microsoft Excel Files
  • 5. Using HTML and Web APIs
  • 6. Interacting with Databases

       Because one long listing would be unwieldy, the code is split into segments; the imports below may be needed in every segment.

# -*- coding:utf-8 -*-
import pandas as pd
from pandas import Series
           

1. Data Loading

1.1 Reading and Writing Data in Text Format

df = pd.read_csv('./python_data/ch06/ex1.csv')
print(df)
'''
   a   b   c   d message
0  1   2   3   4   hello
1  5   6   7   8   world
2  9  10  11  12     foo
'''
df2 = pd.read_table('./python_data/ch06/ex1.csv', sep=',')
print(df2)
'''
   a   b   c   d message
0  1   2   3   4   hello
1  5   6   7   8   world
2  9  10  11  12     foo
'''
# Reading a file that has no header row
df = pd.read_csv('./python_data/ch06/ex2.csv', header = None)
print(df)
'''
   0   1   2   3      4
0  1   2   3   4  hello
1  5   6   7   8  world
2  9  10  11  12    foo
'''
# With no header row, you can also supply column names yourself
df = pd.read_csv('./python_data/ch06/ex2.csv', names = ['a', 'b', 'c', 'd', 'message'])
print(df)
'''
   a   b   c   d message
0  1   2   3   4   hello
1  5   6   7   8   world
2  9  10  11  12     foo
'''
names = ['a', 'b', 'c', 'd', 'message']
# Use the message column as the DataFrame's index
df = pd.read_csv('./python_data/ch06/ex2.csv', names = names, index_col = 'message')
print(df)
'''
         a   b   c   d
message               
hello    1   2   3   4
world    5   6   7   8
foo      9  10  11  12
'''
# To form a hierarchical index from multiple columns, just pass a list of column numbers or names
parsed = pd.read_csv('./python_data/ch06/csv_mindex.csv', index_col = ['key1', 'key2'])
print(parsed)
'''
           value1  value2
key1 key2                
one  a          1       2
     b          3       4
     c          5       6
     d          7       8
two  a          9      10
     b         11      12
     c         13      14
     d         15      16
'''
# When fields are not separated by a fixed delimiter (e.g., varying whitespace or another pattern), a regular expression can be passed as the separator for read_table
result = pd.read_table('./python_data/ch06/ex3.txt', sep=r'\s+')
print(result)
'''
            A         B         C
aaa -0.264438 -1.026059 -0.619500
bbb  0.927272  0.302904 -0.032399
ccc -0.264273 -0.386314 -0.217601
ddd -0.871858 -0.348382  1.100491
'''

# Rows of irregular data can be skipped with skiprows
df = pd.read_csv('./python_data/ch06/ex4.csv', skiprows=[0, 2, 3])
print(df)
'''
   a   b   c   d message
0  1   2   3   4   hello
1  5   6   7   8   world
2  9  10  11  12     foo
'''

# pandas flags a set of commonly occurring sentinel values, such as NA, -1.#IND, and NULL
result = pd.read_csv('./python_data/ch06/ex5.csv')
print(result)
'''
  something  a   b     c   d message
0       one  1   2   3.0   4     NaN
1       two  5   6   NaN   8   world
2     three  9  10  11.0  12     foo
'''
# Detect missing values in the DataFrame; returns booleans
print(pd.isnull(result))
'''
   something      a      b      c      d  message
0      False  False  False  False  False     True
1      False  False  False   True  False    False
2      False  False  False  False  False    False
'''

# na_values accepts a list of strings that represent missing values
result = pd.read_csv('./python_data/ch06/ex5.csv', na_values=['NULL'])
print(result)
'''
  something  a   b     c   d message
0       one  1   2   3.0   4     NaN
1       two  5   6   NaN   8   world
2     three  9  10  11.0  12     foo
'''

# A dict can specify different NA sentinels for each column, i.e., the given values are set to NA
sentinels = {'message': ['foo', 'NA'], 'something': ['two']}
result = pd.read_csv('./python_data/ch06/ex5.csv', na_values=sentinels)
print(result)
'''
  something  a   b     c   d message
0       one  1   2   3.0   4     NaN
1       NaN  5   6   NaN   8   world
2     three  9  10  11.0  12     NaN
'''
           

1.2 Reading Text Files in Pieces

result = pd.read_csv('./python_data/ch06/ex6.csv')
print(result)
'''
           one       two     three      four key
0     0.467976 -0.038649 -0.295344 -1.824726   L
1    -0.358893  1.404453  0.704965 -0.200638   B
2    -0.501840  0.659254 -0.421691 -0.057688   G
3     0.204886  1.074134  1.388361 -0.982404   R
4     0.354628 -0.133116  0.283763 -0.837063   Q
...        ...       ...       ...       ...  ..
9995  2.311896 -0.417070 -1.409599 -0.515821   L
9996 -0.479893 -0.650419  0.745152 -0.646038   E
9997  0.523331  0.787112  0.486066  1.093156   K
9998 -0.362559  0.598894 -1.843201  0.887292   G
9999 -0.096376 -1.012999 -0.657431 -0.573315   0

[10000 rows x 5 columns]
'''
# The file above is large; to read only a few rows, use nrows
result = pd.read_csv('./python_data/ch06/ex6.csv', nrows = 5)
print(result)
'''
        one       two     three      four key
0  0.467976 -0.038649 -0.295344 -1.824726   L
1 -0.358893  1.404453  0.704965 -0.200638   B
2 -0.501840  0.659254 -0.421691 -0.057688   G
3  0.204886  1.074134  1.388361 -0.982404   R
4  0.354628 -0.133116  0.283763 -0.837063   Q
'''
# To read a file in pieces, specify a chunksize (number of rows)
chunker = pd.read_csv('./python_data/ch06/ex6.csv', chunksize=1000)
print(chunker) # <pandas.io.parsers.TextFileReader object at 0x0000015DAA4A8A48>
tot = Series([], dtype='float64') # an explicit dtype avoids a DeprecationWarning on an empty Series
for piece in chunker:
    tot = tot.add(piece['key'].value_counts(), fill_value = 0)
tot = tot.sort_values(ascending=False)
print(tot[:10])
'''
E    368.0
X    364.0
L    346.0
O    343.0
Q    340.0
M    338.0
J    337.0
F    335.0
K    334.0
H    330.0
dtype: float64
'''
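The TextFileReader returned by read_csv also has a get_chunk method for pulling an arbitrary number of rows on demand; a minimal sketch on the same file:

chunker = pd.read_csv('./python_data/ch06/ex6.csv', chunksize=1000)
piece = chunker.get_chunk(10)  # read just the next 10 rows
print(piece.shape)  # (10, 5)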
           

1.3 Writing Data Out to Text Format

data = pd.read_csv('./python_data/ch06/ex5.csv')
data.to_csv('./python_data/ch06/out.csv', index=False) # without index=False, the saved file gains an extra index column
out = pd.read_csv('./python_data/ch06/out.csv')
print(out)
'''
  something  a   b     c   d message
0       one  1   2   3.0   4     NaN
1       two  5   6   NaN   8   world
2     three  9  10  11.0  12     foo
'''
# When saving a file, sep changes the delimiter
data.to_csv('./python_data/ch06/out.csv',sep = '|', index = False)
data_sep = pd.read_csv('./python_data/ch06/out.csv', sep = '|')
print(data_sep)
'''
  something  a   b     c   d message
0       one  1   2   3.0   4     NaN
1       two  5   6   NaN   8   world
2     three  9  10  11.0  12     foo
'''

# Missing values are written as empty strings by default; na_rep replaces them with another sentinel
data.to_csv('./python_data/ch06/out3.csv',na_rep = 'NULL', index = False)
data3 = pd.read_csv('./python_data/ch06/out3.csv')
print(data3)
'''
  something  a   b     c   d message
0       one  1   2   3.0   4     NaN
1       two  5   6   NaN   8   world
2     three  9  10  11.0  12     foo
'''

# Unless other options are given, both the row and column labels are written; both can be disabled
data.to_csv('./python_data/ch06/out4.csv',index = False, header = False)
data4 = pd.read_csv('./python_data/ch06/out4.csv')
print(data4)
'''
     one  1   2   3.0   4 Unnamed: 5
0    two  5   6   NaN   8      world
1  three  9  10  11.0  12        foo
'''
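Note that the read-back above promoted the first data row to a header. When reading a file written without one, pass header=None; a quick check against the same file:

data4 = pd.read_csv('./python_data/ch06/out4.csv', header=None)
print(data4)  # the columns come back as 0..5 and all three data rows survive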

# You can write only a subset of the columns, in an order of your choosing
data.to_csv('./python_data/ch06/out5.csv',index = False, columns=['a','b','c'])
data5 = pd.read_csv('./python_data/ch06/out5.csv')
print(data5)
'''
   a   b     c
0  1   2   3.0
1  5   6   NaN
2  9  10  11.0
'''

# Series also has a to_csv method
import numpy as np
dates = pd.date_range('1/1/2000', periods= 7)
ts = Series(np.arange(7), index = dates)
ts.to_csv('./python_data/ch06/out6.csv')
data_ts = pd.read_csv('./python_data/ch06/out6.csv')
print(data_ts)
'''
   Unnamed: 0  0
0  2000-01-01  0
1  2000-01-02  1
2  2000-01-03  2
3  2000-01-04  3
4  2000-01-05  4
5  2000-01-06  5
6  2000-01-07  6
'''
# Series.from_csv() no longer exists in the pandas shipped for Python 3.7; use read_csv instead, as sketched below
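A written Series can be read back with read_csv instead; a small sketch against the out6.csv written above:

data_ts = pd.read_csv('./python_data/ch06/out6.csv', index_col=0, parse_dates=True)
ts2 = data_ts.iloc[:, 0]  # take the single column back out as a Series
print(ts2.head())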
           

1.4 Working with Delimited Formats Manually

import csv
f = open('./python_data/ch06/ex7.csv')
reader = csv.reader(f)
# Iterating over the reader yields a list of values for each line, with any quote characters removed
for line in reader:
    print(line)
'''
['a', 'b', 'c']
['1', '2', '3']
['1', '2', '3', '4']
'''
lines = list(csv.reader(open('./python_data/ch06/ex7.csv')))
header,values = lines[0], lines[1:]
data_dict = {h: v for h, v in zip(header, zip(*values))}
print(data_dict) # {'a': ('1', '1'), 'b': ('2', '2'), 'c': ('3', '3')}

# Define a new dialect by subclassing csv.Dialect
class my_dialect(csv.Dialect):
    lineterminator = '\n'
    delimiter = ';'
    quotechar = '"'
    quoting = csv.QUOTE_MINIMAL # not in the book; Python 3 raises an error if quoting is left unset

f.seek(0)  # the earlier loop consumed the file; rewind before reading it again
reader = csv.reader(f, dialect=my_dialect)
# reader = csv.reader(f, delimiter='|')  # dialect options can also be passed as keywords

# To write delimited files manually, use csv.writer
with open('mydata.csv', 'w', newline='') as f:
    writer = csv.writer(f, dialect=my_dialect)
    writer.writerow(('one', 'two' ,'three'))
    writer.writerow(('1', '2', '3'))
    writer.writerow(('4', '5', '6'))
    writer.writerow(('7', '8', '9'))
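The dialect options can also be registered under a name with csv.register_dialect instead of subclassing; a quick sketch (my_dialect2 and mydata2.csv are made-up names):

csv.register_dialect('my_dialect2', delimiter=';', lineterminator='\n',
                     quotechar='"', quoting=csv.QUOTE_MINIMAL)
with open('mydata2.csv', 'w', newline='') as f:
    writer = csv.writer(f, dialect='my_dialect2')
    writer.writerow(('one', 'two', 'three'))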
           

1.5 JSON Data

# JSON is a much more flexible data format than tabular text like CSV
obj = """
{"name": "Wes",
"places_lived":["United States", "Spain", "Germany"],
"pet":null,
"siblings":[{"name": "Scott", "age":25, "pet": "Zuko"},
            {"name": "Kitie", "age":33, "pet": "Cisco"}]
}
"""
import json
from pandas import DataFrame
# json.loads converts a JSON string into Python objects
result = json.loads(obj)
print(result)
'''
{'name': 'Wes', 
'places_lived': ['United States', 'Spain', 'Germany'], 
'pet': None, 
'siblings': [{'name': 'Scott', 'age': 25, 'pet': 'Zuko'}, {'name': 'Kitie', 'age': 33, 'pet': 'Cisco'}]}
'''
asjson = json.dumps(result)  # json.dumps goes the other way, from a Python object back to JSON

# Pass a list of JSON objects to the DataFrame constructor and select a subset of the data fields
siblings = DataFrame(result['siblings'], columns = ['name', 'age'])
print(siblings)
'''
    name  age
0  Scott   25
1  Kitie   33
'''
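pandas can also convert to and from JSON directly; a minimal round-trip sketch using the siblings frame built above:

json_str = siblings.to_json()  # serialize; the default orient is 'columns'
print(json_str)
back = pd.read_json(json_str)  # and parse it back into a DataFrame
print(back)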
           

1.6 XML and HTML: Web Scraping

# Open the data URL with urllib.request (urllib2 in the book's Python 2), then parse the resulting stream with lxml
from lxml.html import parse
from urllib.request import urlopen
parsed = parse(urlopen('https://finance.yahoo.com/quote/AAPL/options?ltr=1'))
doc = parsed.getroot()
# Find all URL links in the document; links are a tags in HTML
links = doc.findall('.//a')
print(links[15:20])
'''[<Element a at 0x1c9f65ee408>, <Element a at 0x1c9f65ee318>, <Element a at 0x1c9f65ee7c8>, <Element a at 0x1c9f65ee0e8>, <Element a at 0x1c9f777e2c8>]'''

# Extract every URL in the document
urls = [lnk.get('href') for lnk in doc.findall('.//a')]
print(urls[-10:])
'''
['https://help.yahoo.com/kb/finance-for-web/SLN2310.html?locale=en_US', 
'https://help.yahoo.com/kb/finance-for-web', 
'https://yahoo.uservoice.com/forums/382977', 
'https://policies.oath.com/us/en/oath/privacy/index.html', 
'https://policies.oath.com/us/en/oath/privacy/adinfo/index.html',
'https://www.verizonmedia.com/policies/us/en/verizonmedia/terms/otos/index.html', 
'https://finance.yahoo.com/sitemap/', 
'https://twitter.com/YahooFinance', 
'https://facebook.com/yahoofinance', 
'https://www.linkedin.com/company/yahoo-finance']
'''

tables = doc.findall('.//table')
# Puts (options to sell) and calls (options to buy) the stock; I didn't understand these at first and had to look them up
calls = tables[0]
puts = tables[1]
# Each table has a header row followed by the data rows
rows = calls.findall('.//tr')

def _unpack(rows, kind='td'):
    elts = rows.findall('.//%s' %kind)
    return [val.text_content() for val in elts]

# Extract the text from each cell; header rows use th cells, data rows use td cells
print(_unpack(rows[0], kind='th'))
'''['Contract Name', 'Last Trade Date', 'Strike', 'Last Price', 'Bid', 'Ask', 'Change', '% Change', 'Volume', 'Open Interest', 'Implied Volatility']'''
print(_unpack(rows[1], kind = 'td'))
'''['AAPL200925C00057500', '2020-09-18 12:45PM EDT', '57.50', '49.65', '48.85', '49.75', '-2.45', '-4.70%', '22', '32', '251.95%']'''

# Combine all the steps to convert this data into a DataFrame
from pandas.io.parsers import TextParser
def parse_options_data(table):
    rows = table.findall('.//tr')
    header = _unpack(rows[0],kind='th')
    data = [_unpack(r) for r in rows[1:]]
    return TextParser(data, names = header).get_chunk()

call_data = parse_options_data(calls) # call (buy) data
put_data = parse_options_data(puts)  # put (sell) data
print(call_data[:10])
'''
         Contract Name         Last Trade Date  ...  Open Interest  Implied Volatility
0  AAPL200925C00057500  2020-09-18 12:45PM EDT  ...             32             251.95%
1  AAPL200925C00060000  2020-09-16 12:24PM EDT  ...             38             236.72%
2  AAPL200925C00061250   2020-09-11 1:22PM EDT  ...              0             229.30%
3  AAPL200925C00062500  2020-09-18 12:40PM EDT  ...              1             222.07%
4  AAPL200925C00063750  2020-09-18 11:36AM EDT  ...             73             215.04%
5  AAPL200925C00065000   2020-09-16 2:16PM EDT  ...             21             208.01%
6  AAPL200925C00066250  2020-09-18 12:48PM EDT  ...            112             205.76%
7  AAPL200925C00067500   2020-09-17 9:48AM EDT  ...            333             198.83%
8  AAPL200925C00068750  2020-09-18 11:04AM EDT  ...             72             192.19%
9  AAPL200925C00070000   2020-09-18 3:41PM EDT  ...            409             185.55%
'''
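For plain HTML tables, pandas can collapse the whole procedure above into one call: pd.read_html finds every table on the page and returns a list of DataFrames (it needs lxml or html5lib installed). A quick sketch against the same page:

tables_df = pd.read_html('https://finance.yahoo.com/quote/AAPL/options?ltr=1')
calls_df = tables_df[0]  # the same calls table as above
print(calls_df.head())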

# Parsing XML with lxml.objectify
from lxml import objectify

# Parse the file with lxml.objectify, then use getroot to get a reference to the XML file's root node
path = './python_data/ch06/mta_perf/Performance_MNR.xml'
parsed = objectify.parse(open(path))
root = parsed.getroot()
data = []
skip_fields = ['PARENT_SEQ', 'INDICATOR_SEQ', 'DESIRED_CHANGE', 'DECIMAL_PLACES']

for elt in root.INDICATOR:
    elt_data = {}
    for child in elt.getchildren():
        if child.tag in skip_fields:
           continue
        elt_data[child.tag] = child.pyval
    data.append(elt_data)

# Convert this list of dicts into a DataFrame
perf = DataFrame(data)
print(perf.head())
'''
            AGENCY_NAME  ... MONTHLY_ACTUAL
0  Metro-North Railroad  ...           96.9
1  Metro-North Railroad  ...             95
2  Metro-North Railroad  ...           96.9
3  Metro-North Railroad  ...           98.3
4  Metro-North Railroad  ...           95.8
'''

# Accessing the fields of an HTML link tag
from io import StringIO  # in Python 3, StringIO moved into the io module
tag = '<a href = "http://www.goole.com">Google</a>'
root = objectify.parse(StringIO(tag)).getroot()
# 访问链接文本或标记中的任何字段(如href)
print(root)  # Google (printing an objectify element shows its text content)
print(root.get('href'))  # http://www.goole.com
print(root.text) # Google
           

2. Binary Data Formats

frame = pd.read_csv('python_data/ch06/ex1.csv')
print(frame)
'''
   a   b   c   d message
0  1   2   3   4   hello
1  5   6   7   8   world
2  9  10  11  12     foo
'''
# Store the data in binary (pickle) format
frame.to_pickle('python_data/ch06/frame_pickle') # pandas for Python 3.7 has no save() method; it became to_pickle()
# Load the binary data back
data = pd.read_pickle('python_data/ch06/frame_pickle')  # likewise, load() became read_pickle()
print(data)
'''
   a   b   c   d message
0  1   2   3   4   hello
1  5   6   7   8   world
2  9  10  11  12     foo
'''
           

3. Using the HDF5 Format

# The HDF in HDF5 stands for hierarchical data format; each HDF5 file contains a file-system-like node structure, can store multiple datasets, and supports metadata
# pandas provides a minimal dict-like HDFStore class that stores pandas objects via PyTables
frame = pd.read_csv('python_data/ch06/ex1.csv')
store = pd.HDFStore('mydata.h5')
store['obj1'] = frame
store['obj1_col'] = frame['a']
print(store)
'''
<class 'pandas.io.pytables.HDFStore'>
File path: mydata.h5
'''

# Objects in an HDF5 file can be retrieved the same way as from a dict
print(store['obj1'])
'''
   a   b   c   d message
0  1   2   3   4   hello
1  5   6   7   8   world
2  9  10  11  12     foo
'''
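Close the store when finished; to_hdf and read_hdf give a one-shot shortcut around HDFStore (obj2 is just an arbitrary key for this sketch):

store.close()
frame.to_hdf('mydata.h5', 'obj2', format='table')
print(pd.read_hdf('mydata.h5', 'obj2'))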
           

4. Reading Microsoft Excel Files

# ExcelFile relies on the xlrd and openpyxl packages, which must be installed first
xls_file = pd.ExcelFile('python_data/ch06/data.xls')
table = xls_file.parse('Sheet1')
print(table.head())
'''
    name  age
0  text1   12
1  test2   23
2   deqw   13
3   qweq    3
'''
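The one-shot pd.read_excel reads a single sheet directly, and to_excel writes a frame back out; a short sketch (data_out.xlsx is a made-up output name):

table = pd.read_excel('python_data/ch06/data.xls', sheet_name='Sheet1')
table.to_excel('python_data/ch06/data_out.xlsx', index=False)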
           

5. Using HTML and Web APIs

import requests
# Send an HTTP GET request that searches Baidu for pandas
url = 'https://www.baidu.com/s?wd=pandas&rsv_spt=1'
resp = requests.get(url)
print(resp) # <Response [200]>

import json
# print(resp.text)
# The two lines below fail: the URL above returns an HTML page rather than
# JSON, so json.loads raises a JSONDecodeError. A real JSON endpoint is
# needed instead; see the sketch below.
# data = json.loads(resp.text)
# print(data.keys())
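A minimal sketch against an endpoint that does return JSON, the GitHub issues API that the book's second edition uses (number, title, and state are fields of that API's response):

import requests
from pandas import DataFrame

resp = requests.get('https://api.github.com/repos/pandas-dev/pandas/issues')
data = resp.json()  # a list of dicts, one per issue
issues = DataFrame(data, columns=['number', 'title', 'state'])
print(issues.head())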
           

6. Interacting with Databases

# Use the embedded SQLite database
import sqlite3
query = """
CREATE TABLE test
(a VARCHAR(20), b VARCHAR(20),
c REAL,         d INTEGER
);"""
con = sqlite3.connect(':memory:')
con.execute(query)
con.commit()

# Insert a few rows of data
data = [('Atlanta', 'Georgia', 1.25, 6),
        ('Tallahassee', 'Florida', 2.6, 3),
        ('Sacramento', 'California', 1.7, 5)]
stmt = "INSERT INTO test VALUES(?, ?, ?, ?)"
con.executemany(stmt, data)
con.commit()

# Python SQL drivers return a list of tuples when selecting data
cursor = con.execute('select * from test')
rows = cursor.fetchall()
print(rows)
'''
[('Atlanta', 'Georgia', 1.25, 6),
 ('Tallahassee', 'Florida', 2.6, 3), 
 ('Sacramento', 'California', 1.7, 5)]
 '''

# The tuples can be passed to the DataFrame constructor, but the column names are also needed
print(cursor.description)
'''
(('a', None, None, None, None, None, None), 
 ('b', None, None, None, None, None, None), 
 ('c', None, None, None, None, None, None), 
 ('d', None, None, None, None, None, None))
'''
zip_object = list(zip(*cursor.description)) # transpose the description; the first entry holds the column names
df = DataFrame(rows, columns=zip_object[0])
print(df)
'''
             a           b     c  d
0      Atlanta     Georgia  1.25  6
1  Tallahassee     Florida  2.60  3
2   Sacramento  California  1.70  5
'''

# pandas can simplify this process: read_sql_query (called read_frame in older versions) takes a select statement and the connection object
import pandas.io.sql as sql
ret = sql.read_sql_query('select * from test', con)
print(ret)
'''
             a           b     c  d
0      Atlanta     Georgia  1.25  6
1  Tallahassee     Florida  2.60  3
2   Sacramento  California  1.70  5
'''
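The reverse direction also works through the same connection: DataFrame.to_sql writes a frame out as a table (test_copy is an arbitrary name for this sketch):

ret.to_sql('test_copy', con, index=False)
print(sql.read_sql_query('select * from test_copy', con))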
           
