爬取高中單詞（Scrape senior-high-school English vocabulary words）
import re
import codecs
from urllib import request, error
from bs4 import BeautifulSoup
def askurl(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    Returns ``None`` when the request fails; the HTTP status code and/or
    the failure reason are printed instead of raising, matching the
    original best-effort behavior.
    """
    try:
        # Pretend to be a desktop Chrome browser so the site does not
        # reject the scraper outright.
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'}
        req = request.Request(url=url, headers=headers)
        # Context manager closes the connection deterministically instead
        # of leaking the response object (the original never closed it).
        with request.urlopen(req) as respond:
            return respond.read().decode('utf-8')
    except error.URLError as e:
        if hasattr(e, 'code'):
            print(e.code)
        if hasattr(e, 'reason'):
            print(e.reason)
        return None  # explicit: callers must handle a failed fetch
# Non-greedy capture of everything between a <p>...</p> pair; re.S lets
# '.' match newlines so paragraphs spanning several lines are captured.
FindContent = re.compile(r'<p>(.*?)</p>', re.S)
def getdata():
    """Scrape all 21 pages of the word-list document and return the
    collected <p> texts as a flat list of strings.

    For every page, paragraphs are pulled from the div whose class and id
    are both "contents", the '△' marker is stripped, a few boilerplate
    lines at the top are dropped, and the per-unit heading is removed
    from the first remaining entry.
    """
    datalist = []
    baseurl = 'http://www.1mpi.com/doc/eea782580808987333652d93/'
    for page in range(21):
        html = askurl(baseurl + str(page + 1))
        if html is None:
            # askurl already printed the error; skip this page instead of
            # crashing BeautifulSoup with a None argument.
            continue
        soup = BeautifulSoup(html, 'html.parser')
        for item in soup.find_all('div', {'class': 'contents', 'id': 'contents'}):
            content = FindContent.findall(str(item))
            content = [text.replace('△', '') for text in content]
            try:
                if page != 18:
                    # NOTE: each deletion shifts the indices, so this
                    # statement removes the ORIGINAL items 0, 2 and 4,
                    # and the following del removes original item 1.
                    del content[0], content[1], content[2]
                del content[0]
            except Exception as reason:
                # Page layout differed from the expectation; report and
                # keep whatever survived.
                print(page, reason, content)
            if content:
                # Strip the "必修一 UNIT xx" unit heading from the first
                # entry (guarded so an empty page cannot IndexError).
                content[0] = re.sub('必修一 UNIT.{2}', '', content[0])
            datalist.extend(content)
    return datalist
def savedata(savepath):
    """Scrape the word list and write all entries to *savepath* as UTF-8."""
    datalist = getdata()
    # Builtin open() supersedes legacy codecs.open(); writelines batches
    # the output instead of one write call per entry.
    with open(savepath, 'w', encoding='utf-8') as file:
        file.writelines(datalist)
def main():
    """Entry point: dump the scraped high-school words to a fixed path."""
    savedata('d:\\high school word.txt')


if __name__ == '__main__':
    main()
爬取計算機專業核心單詞(資料未清洗,清洗過程在下一篇繪圖部落格中)（Scrape core computer-science vocabulary; the data is not yet cleaned — cleaning is covered in the next plotting blog post）
import re
import codecs
from urllib import request, error
from bs4 import BeautifulSoup
def askurl(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    Returns ``None`` when the request fails; the HTTP status code and/or
    the failure reason are printed instead of raising, matching the
    original best-effort behavior.
    """
    try:
        # Pretend to be a desktop Chrome browser so the site does not
        # reject the scraper outright.
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'}
        req = request.Request(url=url, headers=headers)
        # Context manager closes the connection deterministically instead
        # of leaking the response object (the original never closed it).
        with request.urlopen(req) as respond:
            return respond.read().decode('utf-8')
    except error.URLError as e:
        if hasattr(e, 'code'):
            print(e.code)
        if hasattr(e, 'reason'):
            print(e.reason)
        return None  # explicit: callers must handle a failed fetch
# Non-greedy capture of everything between a <p>...</p> pair; re.S lets
# '.' match newlines so paragraphs spanning several lines are captured.
FindContent = re.compile(r'<p>(.*?)</p>', re.S)
def getdata():
    """Fetch the vocabulary article and return the list of <p> texts
    found inside its article-content div.

    Returns an empty list when the fetch fails or no matching div is
    present (the original would raise NameError on ``content`` in that
    case, since the variable was only bound inside the loop).
    """
    url = 'https://www.hujiang.com/c/kyyych/p1273859/'
    content = []
    html = askurl(url)
    if html is None:
        # askurl already printed the error; nothing to parse.
        return content
    soup = BeautifulSoup(html, 'html.parser')
    for item in soup.find_all('div', {'class': 'article-content', 'id': 'J-article-content'}):
        content.extend(FindContent.findall(str(item)))
    return content
def savedata(savepath):
    """Scrape the vocabulary page and write the raw entries to *savepath*."""
    datalist = getdata()
    print(datalist)  # kept from original: quick visual check of scraped data
    # Builtin open() supersedes legacy codecs.open(); writelines batches
    # the output instead of one write call per entry.
    with open(savepath, 'w', encoding='utf-8') as file:
        file.writelines(datalist)
def main():
    """Entry point: dump the scraped computer-science words to a fixed path."""
    savedata('d:\\computer major words.txt')


if __name__ == '__main__':
    main()