簡單的需求,擷取重複次數最多的資料
@沛公
import random
def count_values(values):
    """Return a dict mapping each value in ``values`` to its occurrence count.

    The result always contains key ``0`` (with count 0 when it never
    occurs), matching the ``{0: 0}`` seed the script has always used.

    >>> count_values([3, 1, 3])
    {0: 0, 3: 2, 1: 1}
    """
    counts = {0: 0}
    for value in values:
        counts[value] = counts.get(value, 0) + 1
    return counts


def top_items(counts, limit=10):
    """Return up to ``limit`` ``(value, count)`` pairs, most frequent first.

    ``sorted`` is stable even with ``reverse=True``, so values with equal
    counts keep the order in which they were first seen.
    """
    # A key function replaces the Python 2-only ``cmp`` comparator.
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    return ranked[:limit]


def main(draws=1000000, upper=999):
    """Draw ``draws`` random ints in ``[0, upper)`` and print the tallies.

    Prints the full tally dict, then its first ten entries in insertion
    order, then the ten most frequent values.
    """
    # range(draws) yields exactly ``draws`` samples; the former
    # range(1, 1000000) produced one fewer than intended.
    counts = count_values(random.randrange(0, upper) for _ in range(draws))
    print(counts)
    # dict.items() is a view on Python 3 and cannot be sliced directly.
    print(list(counts.items())[0:10])
    print(top_items(counts))


if __name__ == "__main__":
    main()
4.0G 虛擬機測試:100 萬筆資料耗時 0.6 s……表現一般。後續添加 sqlite。
import sqlite3
import random
# Settings and shared database handles for the SQLite variant of the script.
_MAX_VALUE = 100     # exclusive upper bound passed to random.randrange
_DATA_CNT = 1000000  # controls how many random rows the script generates
# Opens test.db relative to the current working directory.
conn = sqlite3.connect('test.db')
# Single module-level cursor shared by got_cnt() and the top-level script.
cur = conn.cursor()
def got_cnt(x, cursor=None):
    """Return the ``times`` recorded for value ``x`` in the ``sorted`` table.

    Args:
        x: The value to look up in the ``value`` column.
        cursor: Cursor to run the query on. Defaults to the module-level
            ``cur`` so existing ``got_cnt(x)`` calls keep working.

    Returns:
        The ``times`` column of the first matching row, or ``0`` when no
        row exists for ``x``.
    """
    if cursor is None:
        cursor = cur
    # A bound parameter replaces %-formatting, so ``x`` is never spliced
    # into the SQL text.
    cursor.execute('SELECT times FROM sorted WHERE value=?;', (x,))
    row = cursor.fetchone()
    if row is None:
        return 0
    return row[0]
# Create the raw-data and tally tables, fill the raw table with random
# values, tally each value into ``sorted``, then print the tally ordered by
# occurrence count (ascending, so the most frequent value is printed last).
try:
    # NOTE(review): plain CREATE TABLE raises sqlite3.OperationalError when
    # test.db already holds these tables from an earlier run.
    cur.execute('''CREATE TABLE ramdon_data
        (id INTEGER PRIMARY KEY NOT NULL,value INTEGER);''')
    cur.execute('''CREATE TABLE sorted
        (id INTEGER PRIMARY KEY NOT NULL,value INTEGER, times INTEGER);''')
    print("create ok")

    # range(_DATA_CNT) yields exactly _DATA_CNT rows; the former
    # range(1, _DATA_CNT) inserted one fewer. executemany with a bound
    # parameter replaces one %-formatted statement per row.
    cur.executemany(
        'INSERT INTO ramdon_data(value) VALUES (?);',
        ((random.randrange(0, _MAX_VALUE),) for _ in range(_DATA_CNT)),
    )
    conn.commit()
    print("data ready")

    cur.execute('SELECT * FROM ramdon_data;')
    # Fetch everything first: got_cnt() and the writes below reuse this
    # same cursor, which would discard a partially-read result set.
    ans = cur.fetchall()
    # NOTE(review): this issues one SELECT plus one write per raw row; a
    # single "INSERT ... SELECT value, COUNT(*) ... GROUP BY value" would
    # compute the same tallies in one statement.
    for d in ans:
        value = d[1]
        times = got_cnt(value) + 1
        if times == 1:  # first sighting of this value: create its tally row
            cur.execute(
                'INSERT INTO sorted(value,times) VALUES(?,?);',
                (value, times),
            )
        else:
            cur.execute(
                'UPDATE sorted SET times=? WHERE value=?;',
                (times, value),
            )
    conn.commit()
    print("calc ready")

    cur.execute('SELECT * FROM sorted ORDER BY times;')
    ans = cur.fetchall()
    print('\nlen is %s' % len(ans))
    for d in ans:
        print(d)
finally:
    # Release the database handle even if a statement above fails.
    conn.close()
加了 sqlite 之後慢多了:100 萬筆資料要 30 s……當然,這跟我毫無節操地讀寫資料有關係。無論如何,sqlite、python 以及 sql 算是入門了。
原文:http://my.oschina.net/mummy108/blog/476841