爬取高中單詞（Scrape senior-high-school English vocabulary words）
import re
import codecs
from urllib import request, error
from bs4 import BeautifulSoup
def askurl(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    Returns ``None`` when the request fails; the HTTP status code and/or
    the failure reason are printed instead of raising, matching the
    original best-effort behavior.
    """
    try:
        # Pretend to be a desktop Chrome browser so the site does not
        # reject the scraper outright.
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'}
        req = request.Request(url=url, headers=headers)
        # Context manager closes the connection deterministically instead
        # of leaking the response object (the original never closed it).
        with request.urlopen(req) as respond:
            return respond.read().decode('utf-8')
    except error.URLError as e:
        if hasattr(e, 'code'):
            print(e.code)
        if hasattr(e, 'reason'):
            print(e.reason)
        return None  # explicit: callers must handle a failed fetch
# Non-greedy capture of everything between a <p>...</p> pair; re.S lets
# '.' match newlines so paragraphs spanning several lines are captured.
FindContent = re.compile(r'<p>(.*?)</p>', re.S)
def getdata():
    """Scrape all 21 pages of the word-list document and return the
    collected <p> texts as a flat list of strings.

    For every page, paragraphs are pulled from the div whose class and id
    are both "contents", the '△' marker is stripped, a few boilerplate
    lines at the top are dropped, and the per-unit heading is removed
    from the first remaining entry.
    """
    datalist = []
    baseurl = 'http://www.1mpi.com/doc/eea782580808987333652d93/'
    for page in range(21):
        html = askurl(baseurl + str(page + 1))
        if html is None:
            # askurl already printed the error; skip this page instead of
            # crashing BeautifulSoup with a None argument.
            continue
        soup = BeautifulSoup(html, 'html.parser')
        for item in soup.find_all('div', {'class': 'contents', 'id': 'contents'}):
            content = FindContent.findall(str(item))
            content = [text.replace('△', '') for text in content]
            try:
                if page != 18:
                    # NOTE: each deletion shifts the indices, so this
                    # statement removes the ORIGINAL items 0, 2 and 4,
                    # and the following del removes original item 1.
                    del content[0], content[1], content[2]
                del content[0]
            except Exception as reason:
                # Page layout differed from the expectation; report and
                # keep whatever survived.
                print(page, reason, content)
            if content:
                # Strip the "必修一 UNIT xx" unit heading from the first
                # entry (guarded so an empty page cannot IndexError).
                content[0] = re.sub('必修一 UNIT.{2}', '', content[0])
            datalist.extend(content)
    return datalist
def savedata(savepath):
    """Scrape the word list and write all entries to *savepath* as UTF-8."""
    datalist = getdata()
    # Builtin open() supersedes legacy codecs.open(); writelines batches
    # the output instead of one write call per entry.
    with open(savepath, 'w', encoding='utf-8') as file:
        file.writelines(datalist)
def main():
    """Entry point: dump the scraped high-school words to a fixed path."""
    savedata('d:\\high school word.txt')


if __name__ == '__main__':
    main()
爬取計算機專業核心單詞(資料未清洗,清洗過程在下一篇繪圖部落格中)（Scrape core computer-science vocabulary; the data is not yet cleaned — cleaning is covered in the next plotting blog post）
import re
import codecs
from urllib import request, error
from bs4 import BeautifulSoup
def askurl(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    Returns ``None`` when the request fails; the HTTP status code and/or
    the failure reason are printed instead of raising, matching the
    original best-effort behavior.
    """
    try:
        # Pretend to be a desktop Chrome browser so the site does not
        # reject the scraper outright.
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'}
        req = request.Request(url=url, headers=headers)
        # Context manager closes the connection deterministically instead
        # of leaking the response object (the original never closed it).
        with request.urlopen(req) as respond:
            return respond.read().decode('utf-8')
    except error.URLError as e:
        if hasattr(e, 'code'):
            print(e.code)
        if hasattr(e, 'reason'):
            print(e.reason)
        return None  # explicit: callers must handle a failed fetch
# Non-greedy capture of everything between a <p>...</p> pair; re.S lets
# '.' match newlines so paragraphs spanning several lines are captured.
FindContent = re.compile(r'<p>(.*?)</p>', re.S)
def getdata():
    """Fetch the vocabulary article and return the list of <p> texts
    found inside its article-content div.

    Returns an empty list when the fetch fails or no matching div is
    present (the original would raise NameError on ``content`` in that
    case, since the variable was only bound inside the loop).
    """
    url = 'https://www.hujiang.com/c/kyyych/p1273859/'
    content = []
    html = askurl(url)
    if html is None:
        # askurl already printed the error; nothing to parse.
        return content
    soup = BeautifulSoup(html, 'html.parser')
    for item in soup.find_all('div', {'class': 'article-content', 'id': 'J-article-content'}):
        content.extend(FindContent.findall(str(item)))
    return content
def savedata(savepath):
    """Scrape the vocabulary page and write the raw entries to *savepath*."""
    datalist = getdata()
    print(datalist)  # kept from original: quick visual check of scraped data
    # Builtin open() supersedes legacy codecs.open(); writelines batches
    # the output instead of one write call per entry.
    with open(savepath, 'w', encoding='utf-8') as file:
        file.writelines(datalist)
def main():
    """Entry point: dump the scraped computer-science words to a fixed path."""
    savedata('d:\\computer major words.txt')


if __name__ == '__main__':
    main()