import re

from pyltp import NamedEntityRecognizer
from pyltp import Parser
from pyltp import Postagger
from pyltp import Segmentor
from pyltp import SementicRoleLabeller
def ltp_segmentor(sentence):
    """Segment a Chinese sentence into a list of words.

    Loads the LTP CWS model together with a user lexicon, segments
    *sentence*, and always releases the model afterwards.

    :param sentence: raw sentence string to segment
    :return: list of word strings
    """
    segmentor = Segmentor()
    cws_model_path = '..\\ltp_data\\cws.model'
    lexicon_path = '..\\ltp_data\\lexicon.txt'
    # load_with_lexicon already loads the model; the original extra
    # segmentor.load() call reloaded the model WITHOUT the custom
    # lexicon, silently discarding it — so that call is removed.
    segmentor.load_with_lexicon(cws_model_path, lexicon_path)
    try:
        words = segmentor.segment(sentence)
        return list(words)
    finally:
        # Release native resources even if segmentation raises.
        segmentor.release()
def extract_data():
    """Run the full LTP pipeline on a sample sentence and print results.

    Pipeline: strip decoration characters, segment, POS-tag (with a
    custom lexicon), dependency-parse, recognize named entities, and
    label semantic roles.  All native models are released in a
    ``finally`` block so they are freed even if a step raises.
    """
    parser = Parser()                     # dependency parser
    postagger = Postagger()               # part-of-speech tagger
    labeller = SementicRoleLabeller()     # semantic role labeller
    recognizer = NamedEntityRecognizer()  # named entity recognizer
    model_path = '..\\ltp_data\\pos.model'
    lexicon_path = '..\\ltp_data\\posLexicon.txt'
    # load_with_lexicon already loads pos.model; the original follow-up
    # postagger.load() reloaded the model without the custom lexicon,
    # discarding it — so that call is removed.
    postagger.load_with_lexicon(model_path, lexicon_path)
    labeller.load('..\\ltp_data\\pisrl_win.model')
    recognizer.load('..\\ltp_data\\ner.model')
    parser.load('..\\ltp_data\\parser.model')
    try:
        content = "#•江都建设集团南京分公司南钢项目部安全生产规章制度不落实,作业现场安全管理缺失,安全操作规程不认真执行"
        text = re.sub("[#•]", "", content)  # strip decoration characters
        # The original called sc_fun.ltp_segmentor, but sc_fun was never
        # imported (NameError); the helper is defined in this module.
        words = ltp_segmentor(text)
        postags = postagger.postag(words)
        arcs = parser.parse(words, postags)
        netags = recognizer.recognize(words, postags)  # named entities
        print(list(netags))
        rely_id = [arc.head for arc in arcs]
        relation = [arc.relation for arc in arcs]  # dependency relations
        # head index 0 means the virtual Root node; otherwise it is the
        # 1-based index of the head word.
        heads = ['Root' if head_id == 0 else words[head_id - 1]
                 for head_id in rely_id]
        roles = labeller.label(words, postags, arcs)
        for i, word in enumerate(words):
            print(i, relation[i], (word, heads[i]), postags[i])
        for role in roles:
            print([role.index,
                   "".join("%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
                           for arg in role.arguments)])
    finally:
        labeller.release()  # release native models
        parser.release()
        postagger.release()
        recognizer.release()
# Script entry point: run the demo extraction pipeline.
if __name__ == '__main__':
    extract_data()
运行结果如下:
['B-Ni', 'I-Ni', 'I-Ni', 'I-Ni', 'I-Ni', 'I-Ni', 'E-Ni', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O','O', 'O', 'O', 'O', 'O', 'O']
0 ATT ('江都', '集团') ns
1 ATT ('建设', '集团') v
[11, 'A1:(0,9)ADV:(10,10)']
[24, 'A1:(19,21)ADV:(22,23)']
也可以将各个步骤分开封装成独立函数,效果是一样的:
def ltp_segmentor(sentence):
    """Segment *sentence* into words with the LTP CWS model."""
    seg = Segmentor()
    seg.load('..\\ltp_data\\cws.model')
    tokens = list(seg.segment(sentence))
    seg.release()
    return tokens
def ltp_parser(words, postags):
    """Dependency-parse *words* (with their *postags*) and return the arcs."""
    dep_parser = Parser()
    dep_parser.load('..\\ltp_data\\parser.model')
    result = list(dep_parser.parse(words, postags))
    dep_parser.release()
    return result
def ltp_postags(words):
    """POS-tag *words* with the LTP model plus a custom lexicon.

    :param words: iterable of segmented word strings
    :return: list of POS-tag strings
    """
    postagger = Postagger()
    model_path = '..\\ltp_data\\pos.model'
    lexicon_path = '..\\ltp_data\\posLexicon.txt'
    # load_with_lexicon loads the model AND the lexicon; the original
    # follow-up postagger.load() reloaded the model without the lexicon,
    # silently discarding the custom entries — so that call is removed.
    postagger.load_with_lexicon(model_path, lexicon_path)
    try:
        return list(postagger.postag(words))
    finally:
        # Always free the native model.
        postagger.release()