NLTK 詞性标注(POS tagging)使用說明
import nltk
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
1.詞性标注器
# Tokenize a sentence and print each token paired with its POS tag.
tokens = word_tokenize('And now for something completely different')
print(pos_tag(tokens))
out:[('And', 'CC'), ('now', 'RB'), ('for', 'IN'), ('something', 'NN'), ('completely', 'RB'), ('different', 'JJ')]
2.str2tuple()建立标注元組
直接從一個字元串構造一個已标注的詞符(token)清單。
第一步是對字元串分詞以 便能通路單獨的詞/标記字元串,然後将每一個轉換成一個元組(使用 str2tuple())
# Build (word, tag) tuples from a pre-tagged string: split on whitespace,
# then convert each "word/TAG" token with nltk.tag.str2tuple().
sent = '''
The/AT grand/JJ jury/NN commented/VBD on/IN a/AT number/NN of/IN
other/AP topics/NNS '''
# Bare expression: relies on the REPL echoing the resulting list.
[nltk.tag.str2tuple(t) for t in sent.split()]
out:[('The', 'AT'),
('grand', 'JJ'),
('jury', 'NN'),
('commented', 'VBD'),
('on', 'IN'),
('a', 'AT'),
('number', 'NN'),
('of', 'IN'),
('other', 'AP'),
('topics', 'NNS')]
# Inspect the (word, tag) pairs of the tagged Brown corpus.
print (nltk.corpus.brown.tagged_words())
[('The', 'AT'), ('Fulton', 'NP-TL'), ...]
# To avoid the complexity of the full Brown tagset, map every tag to the
# simplified universal tagset by passing tagset='universal'.
print (nltk.corpus.brown.tagged_words(tagset='universal'))
[('The', 'DET'), ('Fulton', 'NOUN'), ...]
3.nltk.bigrams(tokens) 和 nltk.trigrams(tokens) nltk.bigrams(tokens) 和 nltk.trigrams(tokens)
一般如果隻是要求窮舉雙連詞或三連詞,則可以直接用nltk中的函數bigrams()或trigrams(), 效果如下面代碼:
import nltk

# FIX: the original bound the sentence to `str`, shadowing the builtin
# str type for the rest of the script; renamed to `sentence`.
sentence = 'you are my sunshine, and all of things are so beautiful just for you.'
tokens = nltk.wordpunct_tokenize(sentence)
# nltk.bigrams()/nltk.trigrams() return lazy generators; wrap in list()
# to see the pairs. Note list() exhausts the generator — re-create it
# if you need to iterate again.
bigram = nltk.bigrams(tokens)
bigram
list(bigram)
trigram = nltk.trigrams(tokens)
list(trigram)
4.nltk.ngrams(tokens, n)
如果要求窮舉四連詞甚至更長的多詞組,則可以用統一的函數ngrams(tokens, n),其中n表示n詞詞組, 該函數表達形式較統一,效果如下代碼:
# nltk.ngrams(tokens, n) generalizes bigrams/trigrams to any n; it also
# returns a lazy generator, so list() is needed to view the contents.
nltk.ngrams(tokens, 2)
list(nltk.ngrams(tokens,2))
5.ConditionalFreqDist條件頻率分布函數 可以統計每個單詞與各詞性标記搭配出現的次數
# ConditionalFreqDist over (condition, sample) pairs: condition on the
# lowercased word, sample its universal POS tag in the 'news' category.
brown_news_tagged = brown.tagged_words(categories='news', tagset='universal')
data = nltk.ConditionalFreqDist((word.lower(), tag)
                                for (word, tag) in brown_news_tagged)
# FIX: the loop body lost its indentation in the original paste, which is
# a syntax error in Python; restored here. Prints every word form that
# was observed with more than 3 distinct tags.
for word in data.conditions():
    if len(data[word]) > 3:
        tags = data[word].keys()
        print(word, ' '.join(tags))
6.檢視跟随詞的詞性标記: 檢視‘often’後面跟随的詞的詞性分布
# Distribution of POS tags for the word that follows 'often'.
brown_lrnd_tagged = nltk.corpus.brown.tagged_words(tagset='universal')
# FIX: nltk.bigrams() already yields pairs lazily; wrapping it in list()
# only materialized a throwaway copy, so iterate the generator directly.
tags = [b[1] for (a, b) in nltk.bigrams(brown_lrnd_tagged) if a[0] == 'often']
fd = nltk.FreqDist(tags)
fd.tabulate()
out:
VERB ADJ ADP . DET ADV NOUN PRT CONJ PRON
209 32 31 23 21 21 4 3 3 2
7.自動标注器
from nltk.corpus import brown
# Tagged and raw sentences from the Brown 'news' category.
brown_tagger_sents=brown.tagged_sents(categories='news')
brown_sents=brown.sents(categories='news')
# Find the single most frequent tag in the corpus; it is used as the
# fallback tag for the default tagger in the next section.
tags=[tag for (word,tag) in brown.tagged_words(categories='news')]
nltk.FreqDist(tags).max()
out:'NN'
8.預設标注器
# Default tagger: assign the overall most frequent tag ('NN') to every
# token — a crude baseline with poor accuracy.
raw = 'I do not like green eggs and ham, I do not like them Sam I am!'
tokens = nltk.word_tokenize(raw)
default_tagger = nltk.DefaultTagger('NN')
default_tagger.tag(tokens)
"NN"出現的次數最多,設定"NN"為預設的詞性, 但是效果不佳
9.正規表達式标注器
# Regex tagging rules, tried in order; the first pattern to match wins,
# so the catch-all '.*' -> 'NN' rule must stay last.
patterns = [
    (r'.*ing$', 'VBG'),               # gerunds
    (r'.*ed$', 'VBD'),                # simple past
    (r'.*es$', 'VBZ'),                # 3rd singular present
    (r'.*ould$', 'MD'),               # modals
    (r'.*\'s$', 'NN$'),               # possessive nouns
    (r'.*s$', 'NNS'),                 # plural nouns
    # BUG FIX: the decimal point must be escaped. The original pattern
    # r'^-?[0-9]+(.[0-9]+)?$' let ANY character separate the digit runs,
    # so strings like '1x5' were wrongly tagged as cardinal numbers.
    (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'), # cardinal numbers
    (r'.*', 'NN')                     # nouns (default)
]
# Build the tagger from the rules above, tag one sentence, then score it
# on the tagged 'news' sentences (evaluate() returns accuracy in [0, 1]).
regexp_tagger = nltk.RegexpTagger(patterns)
print(regexp_tagger.tag(brown.sents()[3]))
regexp_tagger.evaluate(brown.tagged_sents(categories='news'))
10.查詢标注器
from nltk.corpus import brown

# Lookup tagger: tag each of the 100 most frequent words with the tag it
# bears most often; every other word is left untagged (None).
fd = nltk.FreqDist(brown.words(categories='news'))
cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
# IDIOM: most_common(100) instead of most_common()[:100] — avoids sorting
# and materializing the full frequency list just to slice it.
most_freq_words = fd.most_common(100)
likely_tags = {word: cfd[word].max() for (word, freq) in most_freq_words}
baseline_tagger = nltk.UnigramTagger(model=likely_tags)
baseline_tagger.evaluate(brown.tagged_sents(categories='news'))
out:0.45578495136941344
# Tag one concrete sentence with the lookup tagger; words outside the
# 100-word model come back with tag None.
sent = brown.sents(categories='news')[3]
baseline_tagger.tag(sent)
11.N-gram标注
# Unigram model: each word is tagged with the tag it bears most often
# in the training data.
from nltk.corpus import brown
brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')
unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)
print (unigram_tagger.tag(brown_sents[2007]))
print (unigram_tagger.evaluate(brown_tagged_sents))
# Split the data 90/10 into training and test sets so the tagger is
# scored on sentences it has not seen during training.
size = int(len(brown_tagged_sents) * 0.9)
print (size)
train_sents = brown_tagged_sents[:size]
test_sents = brown_tagged_sents[size:]
unigram_tagger = nltk.UnigramTagger(train_sents)
unigram_tagger.evaluate(test_sents)
out:
4160
0.8121200039868434
# General n-gram tagging: a bigram tagger conditions each tag on the
# previous word's tag as well as the word itself.
bigram_tagger = nltk.BigramTagger(train_sents)
bigram_tagger.tag(brown_sents[2007])
unseen_sent = brown_sents[4203]
bigram_tagger.tag(unseen_sent)
# Accuracy collapses on held-out data: any (tag, word) context not seen
# in training yields None, which then propagates down the sentence
# (the sparse-data problem) — see the ~0.10 score below.
bigram_tagger.evaluate(test_sents)
out:0.10206319146815508
12.儲存标注器
# Save the trained tagger with pickle. FIX: use `with` so the file is
# closed even if dump()/load() raises (the originals leaked the handle
# on error).
from pickle import dump
with open('t2.pkl', 'wb') as output:
    dump(t2, output, -1)  # protocol -1: highest available pickle protocol
# Load the tagger back. FIX: the original bound the file object to
# `input`, shadowing the builtin; renamed to `infile`.
from pickle import load
with open('t2.pkl', 'rb') as infile:
    tagger = load(infile)
tagger.tag(brown_sents[22])
13.組合标注器
# Combined tagger with backoff: try the bigram tagger first, fall back
# to the unigram tagger for unseen contexts, and finally to 'NN'.
# (t2 is the tagger pickled in the "save tagger" section above.)
t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(train_sents, backoff=t0)
t2 = nltk.BigramTagger(train_sents, backoff=t1)
t2.evaluate(test_sents)
out:0.8452108043456593
14.跨句子邊界标注
# Repeat the backoff-chain experiment end to end: rebuild the 90/10
# train/test split, train default -> unigram -> bigram with backoff,
# and score the chain on the held-out sentences.
brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')
split_point = int(len(brown_tagged_sents) * 0.9)
train_sents = brown_tagged_sents[:split_point]
test_sents = brown_tagged_sents[split_point:]
fallback_tagger = nltk.DefaultTagger('NN')
unigram_backoff = nltk.UnigramTagger(train_sents, backoff=fallback_tagger)
bigram_chain = nltk.BigramTagger(train_sents, backoff=unigram_backoff)
bigram_chain.evaluate(test_sents)
out:0.8452108043456593