天天看点

python清洗数据去除停用词_python文本处理 数据挖掘 停用词检索

简单描述程序功能:python+flask

1.停用词为csv文件

2.源文件为txt文件

3.文本处理,将源文件中出现的停用词去除

4.根据用户web 表单输入,检索出包含用户输入参数的句子

代码实现:

1.文件读取,分词,源文件词频统计

Python 读取西班牙语文本时,文件编码需指定为 encoding='ISO-8859-1'

# Read the stop-word CSV file (ISO-8859-1 encoded, as used for the Spanish text).
def csvfile(file_path=None):
    """Load the stop-word CSV file into a list of row dictionaries.

    The first row of the file supplies the dictionary keys; every following
    row becomes one ``dict`` mapping those keys to that row's values.

    Args:
        file_path: Path of the CSV file to read. When omitted, the file
            ``SpanishStopWords.csv`` inside ``upload_path`` is used.

    Returns:
        A list with one dict per data row. The list is empty when the file
        contains no data rows (including a completely empty file).
    """
    if file_path is None:
        file_path = os.path.join(upload_path, "SpanishStopWords.csv")
    # newline='' leaves line-ending handling to the csv module, as its
    # documentation requires for files passed to a reader.
    with open(file_path, 'r', encoding='ISO-8859-1', newline='') as f:
        # DictReader takes the first row as the field names by itself, so no
        # separate header read or per-key copy is needed.
        return [dict(row) for row in csv.DictReader(f)]

# Read the source text file and count how often each word occurs.
def eachcount(file_path=None):
    """Return the word frequencies of the source text, most frequent first.

    Commas and full stops are replaced by spaces and the text is split on
    whitespace; no other normalisation (such as lower-casing) is applied.

    Args:
        file_path: Path of the text file to read (decoded as ISO-8859-1).
            When omitted, the file ``Alamo.txt`` inside ``upload_path`` is
            used.

    Returns:
        A list of ``(word, count)`` tuples sorted by count in descending
        order; words with equal counts keep their first-seen order.
    """
    from collections import Counter

    if file_path is None:
        file_path = os.path.join(upload_path, "Alamo.txt")
    # The context manager closes the file even if reading fails.
    with open(file_path, 'r', encoding='ISO-8859-1') as f:
        text = f.read()
    # Tokenise: treat ',' and '.' as separators, then split on whitespace.
    words = text.replace(',', ' ').replace('.', ' ').split()
    # most_common() sorts by count from largest to smallest, which is the
    # ordering the original comment asked for (the old code sorted ascending).
    return Counter(words).most_common()

2.显示在源文件中出现的所有停用词

# Show every stop word that also occurs in the source document.
@application.route('/listsearch/', methods=['GET', 'POST'])
def listsearch():
    """Render ``index.html`` with the stop words found in the source text.

    The stop-word file is read as whitespace-separated tokens and each token
    is compared against every item returned by ``docu2()``. A stop word is
    listed once per equal item, so it can appear more than once in the
    result.

    Returns:
        The rendered ``index.html`` template, with the matches passed as
        ``result2``.
    """
    file_path = os.path.join(upload_path, "SpanishStopWords.csv")
    # The context manager closes the file even if reading fails.
    with open(file_path, 'r', encoding='ISO-8859-1') as f:
        stop_words = f.read().split()
    source_words = docu2()
    # NOTE(review): kept as a pairwise comparison because the element type and
    # duplicate behaviour of docu2() are not visible here; if it returns
    # hashable words and duplicates are unwanted, a set lookup would be cheaper.
    result2 = [
        stop_word
        for stop_word in stop_words
        for source_word in source_words
        if stop_word == source_word
    ]
    return render_template('index.html', result2=result2)


# Front-end template code that renders result2:

search

result

{% for line2 in result2 %}

{{ line2}}

{% endfor %}

3.显示源文件中所有含有数字的句子

@application.route('/test1/', methods=['GET', 'POST'])
def test1():
    """Render ``index.html`` with the sentences of the source text that contain a digit.

    The text of ``Alamo.txt`` is split on ``'.'`` and every piece containing
    at least one ASCII digit is kept, in document order.

    Returns:
        The rendered ``index.html`` template, with the matching sentences
        passed as ``result9``.
    """
    file_path = os.path.join(upload_path, "Alamo.txt")
    # The context manager closes the file even if reading fails.
    with open(file_path, 'r', encoding='ISO-8859-1') as f:
        # NOTE(review): splitting on '.' also cuts decimal numbers such as
        # "3.5" in two; kept as in the original.
        sentences = f.read().split('.')
    # search() finds a digit anywhere in the sentence. The previous
    # re.match('.*[0-9].*') missed digits located after a line break, because
    # '.' does not match '\n' and match() is anchored at the start.
    digit = re.compile(r'[0-9]')
    result2 = [sentence for sentence in sentences if digit.search(sentence)]
    return render_template('index.html', result9=result2)

4.用户web 表单输入参数,根据用户输入,显示源文件中包含用户输入参数的句子。

@application.route('/test2/', methods=['GET', 'POST'])
def test2():
    """Render ``index.html`` with the sentences that contain the user's search term.

    The search term is read from the ``word10`` query-string parameter. The
    text of ``Alamo.txt`` is split on ``'.'`` and every piece that contains
    the term as a substring (case-sensitive) is kept, in document order.

    Returns:
        The rendered ``index.html`` template, with the matching sentences
        passed as ``result10``. The list is empty when ``word10`` is absent
        from the request.
    """
    # NOTE(review): request.args only covers the query string; if the form is
    # submitted via POST the value would be in request.form — confirm against
    # the template.
    word = request.args.get("word10")
    if word is None:
        # Without this guard ``word in sentence`` raises TypeError when the
        # page is opened with no search parameter.
        return render_template('index.html', result10=[])

    file_path = os.path.join(upload_path, "Alamo.txt")
    # The context manager closes the file even if reading fails.
    with open(file_path, 'r', encoding='ISO-8859-1') as f:
        sentences = f.read().split('.')
    result2 = [sentence for sentence in sentences if word in sentence]
    return render_template('index.html', result10=result2)

前端代码展现:

1

2 submit

3 {% for li in result9 %}

4

{{ li}}

5

6 {% endfor %}

7

8

9

11

13 submit

14 {% for li in result10 %}

15

{{ li}}

16

17 {% endfor %}

18