資訊檢索實驗報告
[計算機][實驗一]
實驗題目
反向索引與布爾查詢
實驗内容
- 對所給的Tweets資料集建立反向索引;
- 實作Boolean Retrieval Model,使用TREC 2014 test topics進行測試;
- Boolean Retrieval Model中支援and, or ,not,查詢優化可選做;
實驗過程
- 資料預處理
先來看一下初始資料格式:
![](https://img.laitimes.com/img/9ZDMuAjOiMmIsIjOiQnIsICM38FdsYkRGZkRG9lcvx2bjxiNx8VZ6l2cs0TP350MrpXTwsmeNBDOsJGcohVYsR2MMBjVtJWd0ckW65UbM5WOHJWa5kHT20ESjBjUIF2X0hXZ0xCMx81dvRWYoNHLrdEZwZ1Rh5WNXp1bwNjW1ZUba9VZwlHdssmch1mclRXY39CXldWYtlWPzNXZj9mcw1ycz9WL49zZuBnL0cjN4UTOxETM3ADMxkTMwIzLc52YucWbp5GZzNmLn9Gbi1yZtl2Lc9CX6MHc0RHaiojIsJye.png)
資料集以推特為單位,每條推特包含userName,clusterNo,text,timeStr,tweetId,errorCode,textCleaned,relevance等屬性。
我們的目的是建構反向索引,需要的資訊主要是userName,text,tweetId,所以在預處理過程中,我使用Python將資料集以tweet為單位逐行讀取,並對字串切片,完成屬性的分割。
核心代碼
lines = f.readlines()
for line in lines:
line = line[tweetid:errorcode] + line[username:clusterno] + line[text:timestr] #預處理 切片,提取資訊
terms = TextBlob(line).words.singularize()#分詞
terms = terms.lemmatize("v")#單詞變體還原
預處理後的文本如下所示,可以看到隻保留了關鍵資訊:
- 建立索引
建立一個字典postings,用於存放整個反向索引:對處理後每一條tweet的每一個單詞,將對應的tweetid加入該單詞的清單(postings list)中。
#建立索引
for word in terms:
if word in postings.keys():
postings[word].append(tweetid)
else:
postings[word] = [tweetid]
建立完成的索引部分如下所示:
- 布爾查詢
單個布爾查詢
首先判斷所給term是否在postings中:如果在,則answer = postings[term];否則answer = []。
多個布爾查詢
and/or連成的布爾查詢,先分別對每個單詞進行查詢,最後通過指針同時遍歷多個查詢得到的id序列,以線性的複雜度完成合併(前提是各id序列已按相同順序排序)。
涉及3個或3個以上的連接詞時,同樣可以先對每個單詞進行查詢;兩兩合併時,可以優先選取長度較短的兩個清單合併。
涉及not的查詢,這裡的做法是再次遍歷第一個單詞查詢所得清單中的每個id,去掉同時出現在另一個單詞清單中的id。
for term in postings[term1]:
if term not in postings[term2]:
answer.append(term)
以部分TREC 2014 test資料為例,可以看到查詢結果
所有代碼:
import sys
from collections import defaultdict
from textblob import TextBlob
from textblob import Word
# Field labels that survive tokenization of a raw tweet record; tokenize_tweet
# filters them out so they never become index terms.
uselessTerm = ["username", "text", "tweetid"]

# Inverted index: term -> list of tweet ids whose tokens contain that term.
# The default factory is `list` so it matches the value type actually stored
# (the previous `dict` factory would have produced a value of the wrong type
# on any accidental missing-key access).
postings = defaultdict(list)
def tokenize_tweet(document):
    """Turn one raw tweet record into a list of normalized tokens.

    The record is lower-cased, then three slices are cut out of it by
    searching for field labels: the tweet-id field, the user-name field and
    the text field (concatenated in that order). The concatenation is
    tokenized with TextBlob, each word is singularized and lemmatized as a
    verb, and tokens listed in ``uselessTerm`` are dropped.

    Raises:
        ValueError: from ``str.index`` / ``str.rindex`` when one of the
            expected field labels does not occur in the record.
    """
    lowered = document.lower()

    # Locate the field boundaries. "tweetid" and "errorcode" are searched from
    # the right, the others from the left.
    # NOTE(review): the -1 / -3 adjustments presumably skip quoting/separator
    # characters of the record format — confirm against the data file.
    user_start = lowered.index("username")
    user_end = lowered.index("clusterno")
    id_start = lowered.rindex("tweetid") - 1
    id_end = lowered.rindex("errorcode")
    text_start = lowered.index("text")
    text_end = lowered.index("timestr") - 3

    # Keep only tweet id, user name and tweet text, in that order.
    snippet = (
        lowered[id_start:id_end]
        + lowered[user_start:user_end]
        + lowered[text_start:text_end]
    )

    # Singularize, then reduce each word to its verb lemma.
    lemmas = (
        Word(raw_word).lemmatize("v")
        for raw_word in TextBlob(snippet).words.singularize()
    )
    # Drop the leftover field labels.
    return [lemma for lemma in lemmas if lemma not in uselessTerm]
# Read the tweet file and build the inverted index.
def get_postings():
    """Populate the global ``postings`` index from the tweets file.

    Every line of the input file is treated as one tweet. It is tokenized
    with ``tokenize_tweet``; the first token is taken as the tweet id and the
    remaining distinct tokens are indexed under that id.

    Side effects:
        * appends each tokenized tweet to ``preprocessed.txt``;
        * appends a dump of the finished index to ``Inverted2.txt``;
        * mutates the module-level ``postings`` mapping in place.

    All three files are managed by a ``with`` statement so they are closed
    even if tokenization raises. The input file is opened with the platform
    default encoding, exactly as before.
    """
    with open(r"C:\Users\ASUS\Desktop\tweets.txt") as tweets, \
            open(r"C:\Users\ASUS\Desktop\Inverted2.txt", mode='a', encoding='utf-8') as mylog, \
            open(r"C:\Users\ASUS\Desktop\preprocessed.txt", mode='a', encoding='utf-8') as mylog2:
        # Stream the file line by line instead of loading it all at once.
        for raw_line in tweets:
            tokens = tokenize_tweet(raw_line)
            print(tokens, file=mylog2)
            if not tokens:
                # Nothing left after filtering: there is no id to index under.
                continue
            tweetid, *terms = tokens
            # A set so that each tweet id is recorded once per term.
            for term in set(terms):
                postings.setdefault(term, []).append(tweetid)

        # The two-pointer merges used by the AND queries compare ids with `<`
        # and therefore need every postings list in ascending order.
        for id_list in postings.values():
            id_list.sort()

        print(postings, file=mylog)
# `postings` is the already-built inverted index.
def merge2_and(term1, term2):
    """Return the tweet ids that appear under both ``term1`` and ``term2``.

    Walks the two postings lists in lockstep with one cursor each, advancing
    the cursor that points at the smaller id. This only finds every common
    id when both lists are in ascending order under ``<``.

    Returns an empty list when either term is not in the index.
    """
    if term1 not in postings or term2 not in postings:
        return []

    left = postings[term1]
    right = postings[term2]
    left_len, right_len = len(left), len(right)

    common = []
    li = ri = 0
    while li < left_len and ri < right_len:
        left_id, right_id = left[li], right[ri]
        if left_id == right_id:
            common.append(left_id)
            li += 1
            ri += 1
        elif left_id < right_id:
            li += 1
        else:
            ri += 1
    return common
def merge2_or(term1, term2):
    """Return the tweet ids that appear under ``term1`` or ``term2``.

    The result lists ``term1``'s ids first (in index order), followed by the
    ids of ``term2`` that were not already present. A term missing from the
    index contributes nothing.

    The returned list is always a fresh list: the previous implementation
    appended straight into ``postings[term1]``, which silently corrupted the
    index for every later query.
    """
    first = postings[term1] if term1 in postings else []
    second = postings[term2] if term2 in postings else []

    answer = list(first)  # copy, never alias the index
    seen = set(first)     # O(1) membership instead of scanning the list
    for item in second:
        if item not in seen:
            seen.add(item)
            answer.append(item)
    return answer
def merge2_not(term1, term2):
    """Return the tweet ids under ``term1`` that do not appear under ``term2``.

    Returns an empty list when ``term1`` is not indexed, and a copy of
    ``term1``'s postings when ``term2`` is not indexed. The order of
    ``term1``'s postings list is preserved.
    """
    if term1 not in postings:
        return []

    candidates = postings[term1]
    if term2 not in postings:
        # Copy so callers can never mutate the index through the result.
        return list(candidates)

    # Build the exclusion set once: O(1) lookups instead of a list scan per id.
    excluded = set(postings[term2])
    return [tweetid for tweetid in candidates if tweetid not in excluded]
def merge3_and(term1, term2, term3):
    """Return the tweet ids that appear under all three terms.

    First intersects ``term1`` and ``term2`` via ``merge2_and``, then walks
    that result and ``term3``'s postings with two cursors. As with
    ``merge2_and``, both sequences must be in ascending order for every
    common id to be found.

    Returns an empty list when ``term3`` is not indexed or when the first
    intersection is already empty.
    """
    if term3 not in postings:
        return []

    pair_hits = merge2_and(term1, term2)
    if pair_hits == []:
        return pair_hits

    third = postings[term3]
    pair_len, third_len = len(pair_hits), len(third)

    common = []
    pi = ti = 0
    while pi < pair_len and ti < third_len:
        pair_id, third_id = pair_hits[pi], third[ti]
        if pair_id == third_id:
            common.append(pair_id)
            pi += 1
            ti += 1
        elif pair_id < third_id:
            pi += 1
        else:
            ti += 1
    return common
def merge3_or(term1, term2, term3):
    """Return the tweet ids that appear under any of the three terms.

    The ids of ``term1 OR term2`` (as produced by ``merge2_or``) come first,
    followed by the ids of ``term3`` that were not already present.

    The result is always a fresh list. The intermediate result is copied
    before being extended, and ``postings[term3]`` is never returned by
    reference, so the index cannot be modified through this function.
    """
    # Copy: the list handed back by merge2_or must not be extended in place.
    answer = list(merge2_or(term1, term2))
    if term3 not in postings:
        return answer
    if not answer:
        return list(postings[term3])

    seen = set(answer)  # O(1) membership instead of scanning the list
    for item in postings[term3]:
        if item not in seen:
            seen.add(item)
            answer.append(item)
    return answer
def merge3_and_or(term1, term2, term3):
    """Evaluate ``(term1 AND term2) OR term3`` and return the matching ids.

    The ids of ``term1 AND term2`` (from ``merge2_and``) come first, followed
    by the ids of ``term3`` that were not already present. If ``term3`` is
    not indexed, only the intersection is returned.

    The result is always a fresh list; ``postings[term3]`` is copied rather
    than returned by reference so callers cannot alter the index.
    """
    answer = list(merge2_and(term1, term2))
    if term3 not in postings:
        return answer
    if not answer:
        return list(postings[term3])

    seen = set(answer)  # O(1) membership instead of scanning the list
    for item in postings[term3]:
        if item not in seen:
            seen.add(item)
            answer.append(item)
    return answer
def merge3_or_and(term1, term2, term3):
    """Evaluate ``(term1 OR term2) AND term3`` and return the matching ids.

    Returns the ids of ``term1 OR term2`` (in the order ``merge2_or`` yields
    them) that also appear under ``term3``.

    Two defects of the earlier version are fixed here:
      * when ``term3`` is not indexed the conjunction is empty, so ``[]`` is
        returned instead of the whole ``term1 OR term2`` result;
      * the union is filtered by set membership rather than a two-pointer
        walk, because ``merge2_or`` appends the second term's ids after the
        first term's and so does not return a sorted sequence.
    """
    if term3 not in postings:
        return []

    union = merge2_or(term1, term2)
    required = set(postings[term3])
    return [tweetid for tweetid in union if tweetid in required]
def do_rankSearch(terms):
    """Rank tweets by how many of the query terms they match.

    For every query term found in the index, each tweet id in its postings
    list gets one point (a term repeated in ``terms`` counts each time).

    Returns:
        A list of ``(tweetid, score)`` tuples sorted by score, highest first.
        Ties keep the order in which the tweet ids were first encountered.
    """
    hit_counts = {}
    for term in terms:
        if term not in postings:
            continue
        for tweetid in postings[term]:
            hit_counts[tweetid] = hit_counts.get(tweetid, 0) + 1

    # Sort by the number of matched terms, descending (the sort is stable).
    return sorted(hit_counts.items(), key=lambda pair: pair[1], reverse=True)
def token(doc):
    """Normalize a query string into a list of tokens.

    Applies the same normalization steps as the indexing side: lower-case,
    TextBlob word tokenization, singularization and verb lemmatization.
    Unlike ``tokenize_tweet`` nothing is filtered out, so the operator words
    of a Boolean query stay in the result.
    """
    words = TextBlob(doc.lower()).words.singularize()
    return [Word(raw_word).lemmatize("v") for raw_word in words]
def do_search():
    """Prompt for one query, evaluate it and print the result.

    The tokenized query is interpreted by its length:
      * 3 tokens  -> ``A <op> B`` with ``<op>`` in and / or / not;
      * 5 tokens  -> ``A <op> B <op> C`` with and/or operators, evaluated
        left to right, i.e. ``(A <op> B) <op> C``;
      * otherwise -> ranked search over all tokens, printing each tweet id
        with its score divided by the number of query tokens.

    An empty query terminates the program via ``sys.exit()``.
    """
    terms = token(input("Search query >> "))
    if terms == []:
        sys.exit()

    # Operator -> handler for two-operand queries.
    two_term_ops = {
        "and": merge2_and,
        "or": merge2_or,
        "not": merge2_not,
    }
    # (first operator, second operator) -> handler for three-operand queries.
    three_term_ops = {
        ("and", "and"): merge3_and,       # A and B and C
        ("or", "or"): merge3_or,          # A or B or C
        ("and", "or"): merge3_and_or,     # (A and B) or C
        ("or", "and"): merge3_or_and,     # (A or B) and C
    }

    term_count = len(terms)

    if term_count == 3:
        handler = two_term_ops.get(terms[1])
        if handler is None:
            # The three tokens do not form a supported "A op B" query.
            print("input wrong!")
        else:
            print(handler(terms[0], terms[2]))
        return

    if term_count == 5:
        handler = three_term_ops.get((terms[1], terms[3]))
        if handler is None:
            print("More format is not supported now!")
        else:
            print(handler(terms[0], terms[2], terms[4]))
        return

    # Free-text query: rank tweets by the share of query tokens they match.
    ranked = do_rankSearch(terms)
    print("[Rank_Score: Tweetid]")
    for tweetid, score in ranked:
        print(str(score / term_count) + ": " + tweetid)
def main():
    """Build the inverted index once, then answer queries until exit.

    The loop itself never terminates; the program ends when ``do_search``
    calls ``sys.exit()`` on an empty query.
    """
    get_postings()
    while True:
        do_search()


# Run the interactive search only when executed as a script.
if __name__ == "__main__":
    main()
注:說實話,這份代碼不是我自己寫的,是我找的,然後我稍作修改,增加了一下預處理和倒排完成的輸出,但原創的作者這個代碼寫的真的很好,我舉個很簡單的細節(main函數裡面隻調用了兩個函數,别的什麼也沒有了),另外,從代碼風格,整體結果,變量函數命名,函數使用,都很好,相信認真看這份代碼的童鞋能學到很多。
附上代碼,資料集,處理過程資料集連結:
連結: https://pan.baidu.com/s/1271WUE-0kiu8sSNqDyF4Ew 提取碼: n9d8 複制這段内容後打開百度網盤手機App,操作更友善哦