声明:代码的运行环境为Python3。Python3与Python2在一些细节上会有所不同,希望广大读者注意。本博客以代码为主,代码中会有详细的注释。相关文章将会发布在我的个人博客专栏《Python自然语言处理》,欢迎大家关注。
一、信息提取与分块
1、信息提取
# 信息提取
def ie_preprocess(document):
sentences = nltk.sent_tokenize(document)
sentences = [nltk.word_tokenize(sent) for sent in sentences]
sentences = [nltk.pos_tag(sent) for sent in sentences] # 词性标注
2、分块
# 正则表达式分块
grammar = r"""
NP: {<DT|PP\$>?<JJ>*<NN>} # chunk determiner/possessive, adjectives and nouns
{<NNP>+} # chunk sequences of proper nouns
""" # 定义分块的语法
cp = nltk.RegexpParser(grammar) # 定义分块器
sentence = [("Rapunzel", "NNP"), ("let", "VBD"), ("down", "RP"),
("her", "PP$"), ("long", "JJ"), ("golden", "JJ"), ("hair", "NN")]
result = cp.parse(sentence) # 将分块器应用于已有的句子中
print(result) # 打印结果
result.draw() # 树状结构表示出来
分块之后的结果及树状表示如下所示:
(S
(NP Rapunzel/NNP)
let/VBD
down/RP
(NP her/PP$ long/JJ golden/JJ hair/NN))
3、例子
# 探索文本语料库
cp = nltk.RegexpParser('CHUNK: {<V.*> <TO> <V.*>}')
brown = nltk.corpus.brown
for sent in brown.tagged_sents():
tree = cp.parse(sent)
for subtree in tree.subtrees():
if subtree.label() == 'CHUNK': print(subtree)
# 加缝隙
grammar = r"""
NP:
{<.*>+} # Chunk everything
}<VBD|IN>+{ # Chink sequences of VBD and IN
"""
sentence = [("the", "DT"), ("little", "JJ"), ("yellow", "JJ"),
("dog", "NN"), ("barked", "VBD"), ("at", "IN"), ("the", "DT"), ("cat", "NN")]
cp = nltk.RegexpParser(grammar)
print(cp.parse(sentence))
二、评估分块器
1、评估基准
from nltk.corpus import conll2000
cp = nltk.RegexpParser("")
test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
print(cp.evaluate(test_sents))
当RegexpParser为空是,可以得到评估的结果为:
ChunkParse score:
IOB Accuracy: 43.4%%
Precision: 0.0%%
Recall: 0.0%%
F-Measure: 0.0%%
2、简单评估
grammar = r"NP: {<[CDJNP].*>+}"
cp = nltk.RegexpParser(grammar)
print(cp.evaluate(test_sents))
此时的评估结果为:
ChunkParse score:
IOB Accuracy: 87.7%%
Precision: 70.6%%
Recall: 67.8%%
F-Measure: 69.2%%
3、使用unigram标注器对名词短语分块
# 使用unigram标注器对名词短语分块
class UnigramChunker(nltk.ChunkParserI):
def __init__(self, train_sents):
train_data = [[(t, c) for w, t, c in nltk.chunk.tree2conlltags(sent)]
for sent in train_sents]
self.tagger = nltk.UnigramTagger(train_data)
def parse(self, sentence):
pos_tags = [pos for (word, pos) in sentence]
tagged_pos_tags = self.tagger.tag(pos_tags)
chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]
conlltags = [(word, pos, chunktag) for ((word, pos), chunktag)
in zip(sentence, chunktags)]
return nltk.chunk.conlltags2tree(conlltags)
test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
unigram_chunker = UnigramChunker(train_sents)
print(unigram_chunker.evaluate(test_sents))
得到的结果为:
ChunkParse score:
IOB Accuracy: 92.9%%
Precision: 79.9%%
Recall: 86.8%%
F-Measure: 83.2%%
4、使用Bigram标注器分块
class BigramChunker(nltk.ChunkParserI):
def __init__(self, train_sents):
train_data = [[(t, c) for w, t, c in nltk.chunk.tree2conlltags(sent)]
for sent in train_sents]
self.tagger = nltk.BigramTagger(train_data)
def parse(self, sentence):
pos_tags = [pos for (word, pos) in sentence]
tagged_pos_tags = self.tagger.tag(pos_tags)
chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]
conlltags = [(word, pos, chunktag) for ((word, pos), chunktag)
in zip(sentence, chunktags)]
return nltk.chunk.conlltags2tree(conlltags)
bigram_chunker = BigramChunker(train_sents)
print(bigram_chunker.evaluate(test_sents))
得到的结果为:
ChunkParse score:
IOB Accuracy: 93.3%%
Precision: 82.3%%
Recall: 86.8%%
F-Measure: 84.5%%
5、训练基于分类器的分块器
(1)
# 训练基于分类器的分块器
class ConsecutiveNPChunkTagger(nltk.TaggerI):
def __init__(self, train_sents):
train_set = []
for tagged_sent in train_sents:
untagged_sent = nltk.tag.untag(tagged_sent)
history = []
for i, (word, tag) in enumerate(tagged_sent):
featureset = npchunk_features(untagged_sent, i, history)
train_set.append((featureset, tag))
history.append(tag)
self.classifier = nltk.MaxentClassifier.train(train_set, trace=0) # 使用最大熵分类器(此处使用此分类器比使用朴素贝叶斯分类器效果好)
def tag(self, sentence):
history = []
for i, word in enumerate(sentence):
featureset = npchunk_features(sentence, i, history)
tag = self.classifier.classify(featureset)
history.append(tag)
return zip(sentence, history)
class ConsecutiveNPChunker(nltk.ChunkParserI):
def __init__(self, train_sents):
tagged_sents = [[((w, t), c) for (w, t, c) in nltk.chunk.tree2conlltags(sent)]
for sent in train_sents]
self.tagger = ConsecutiveNPChunkTagger(tagged_sents)
def parse(self, sentence):
tagged_sents = self.tagger.tag(sentence)
conlltags = [(w, t, c) for ((w, t), c) in tagged_sents]
return nltk.chunk.conlltags2tree(conlltags)
def npchunk_features(sentence, i, history):
word, pos = sentence[i]
return {"pos": pos}
chunker = ConsecutiveNPChunker(train_sents)
print(chunker.evaluate(test_sents))
(2)
def npchunk_features(sentence, i, history):
word, pos = sentence[i]
if i == 0:
prevword, prevpos = "<START>", "<START>"
else:
prevword, prevpos = sentence[i - 1]
return {"pos": pos, "prevpos": prevpos}
chunker = ConsecutiveNPChunker(train_sents)
print(chunker.evaluate(test_sents))
(3)
def npchunk_features(sentence, i, history):
word, pos = sentence[i]
if i == 0:
prevword, prevpos = "<START>", "<START>"
else:
prevword, prevpos = sentence[i - 1]
return {"pos": pos, "word": word, "prevpos": prevpos}
chunker = ConsecutiveNPChunker(train_sents)
print(chunker.evaluate(test_sents))
(4)
def npchunk_features(sentence, i, history):
word, pos = sentence[i]
if i == 0:
prevword, prevpos = "<START>", "<START>"
else:
prevword, prevpos = sentence[i - 1]
if i == len(sentence) - 1:
nextword, nextpos = "<END>", "<END>"
else:
nextword, nextpos = sentence[i + 1]
return {"pos": pos,
"word": word,
"prevpos": prevpos,
"nextpos": nextpos,
"prevpos+pos": "%s+%s" % (prevpos, pos),
"pos+nextpos": "%s+%s" % (pos, nextpos),
"tags-since-dt": tags_since_dt(sentence, i)}
def tags_since_dt(sentence, i):
tags = set()
for word, pos in sentence[:i]:
if pos == 'DT':
tags = set()
else:
tags.add(pos)
return '+'.join(sorted(tags))
chunker = ConsecutiveNPChunker(train_sents)
print(chunker.evaluate(test_sents))
三、语言结构中的递归
1、用级联分块器构建嵌套结构
# 用级联分块器构建嵌套结构
grammar = r"""
NP: {<DT|JJ|NN.*>+} # Chunk sequences of DT, JJ, NN
PP: {<IN><NP>} # Chunk prepositions followed by NP
VP: {<VB.*><NP|PP|CLAUSE>+$} # Chunk verbs and their arguments
CLAUSE: {<NP><VP>} # Chunk NP, VP
"""
cp = nltk.RegexpParser(grammar)
sentence = [("Mary", "NN"), ("saw", "VBD"), ("the", "DT"), ("cat", "NN"),
("sit", "VB"), ("on", "IN"), ("the", "DT"), ("mat", "NN")]
print(cp.parse(sentence))
结果为:
(S
(NP Mary/NN)
saw/VBD
(CLAUSE
(NP the/DT cat/NN)
(VP sit/VB (PP on/IN (NP the/DT mat/NN)))))
再次测试:
cp = nltk.RegexpParser(grammar, loop=2)
print(cp.parse(sentence))
结果为:
(S
(CLAUSE
(NP Mary/NN)
(VP
saw/VBD
(CLAUSE
(NP the/DT cat/NN)
(VP sit/VB (PP on/IN (NP the/DT mat/NN)))))))
2、树与遍历
(1)树
tree1 = nltk.Tree('NP', ['Alice'])
tree2 = nltk.Tree('NP', ['the', 'rabbit'])
tree3 = nltk.Tree('VP', ['chased', tree2])
tree4 = nltk.Tree('S', [tree1, tree3])
print(tree4)
结果为:
(S (NP Alice) (VP chased (NP the rabbit)))
(2)遍历
# 遍历树
def traverse(t):
try:
t.label()
except AttributeError:
print(t),
else:
# Now we know that t.node is defined
print('(', t.label(),)
for child in t:
traverse(child)
print(')',)
traverse(tree4)
结果:
( S
( NP
Alice
)
( VP
chased
( NP
the
rabbit
)
)
)
3、关系抽取
import re
IN = re.compile(r'.*\bin\b(?!\b.+ing)')
for doc in nltk.corpus.ieer.parsed_docs('NYT_19980315'):
for rel in nltk.sem.extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern=IN):
print(nltk.sem.relextract.rtuple(rel))
from nltk.corpus import conll2002
vnv = """
(
is/V| # 3rd sing present and
was/V| # past forms of the verb zijn ('be')
werd/V| # and also present
wordt/V # past of worden ('become')
)
.* # followed by anything
van/Prep # followed by van ('of')
"""
VAN = re.compile(vnv, re.VERBOSE)
for doc in conll2002.chunked_sents('ned.train'):
for r in nltk.sem.extract_rels('PER', 'ORG', doc, corpus='conll2002', pattern=VAN):
print(nltk.sem.relextract.rtuple(r))