
Python | Scraping the manga 一人之下 (Under One Person) and saving it as PDF files

I've recently been watching Yi Ren Zhi Xia 4 ("The Chen Duo Arc") on Tencent Video, but this season is only 12 episodes and has already wrapped up. Still full of curiosity about Chen Duo and unable to wait for the next season, I took matters into my own hands O(∩_∩)O haha~

The protagonist of this post:

36漫画网 (m.36mh.net)

This site's anti-scraping measures are fairly lax (commendably so), and the whole thing is a very simple crawl, so I won't walk through it step by step. The images are lazy-loaded, but the real URLs are easy to find.
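As a quick sketch of what the lazy-loading markup looks like: the real image URL sits in the src attribute of each mip-img tag. The chapter URL here is hypothetical, the real ones are scraped from the index page in the full script:

import requests
from pyquery import PyQuery

# Hypothetical chapter page; real URLs come from the chapter index.
chapter_url = 'https://m.36mh.net/manhua/yirenzhixia/1.html'
html = requests.get(chapter_url, headers={'User-Agent': 'Mozilla/5.0'})
html.encoding = 'utf-8'
# The lazy-loaded images are <mip-img> tags; the URL is in their src attribute.
for img in PyQuery(html.text)('div > mip-link mip-img').items():
    print(img.attr('src'))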

Straight to the code:

from requests.packages.urllib3.exceptions import InsecureRequestWarning
from reportlab.lib.pagesizes import portrait
from reportlab.pdfgen import canvas
from pyquery import PyQuery
from PIL import Image
import requests
import time
import glob
import os

requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36',
}


def get_chapter(url):
    """获取每章的url链接"""
    html = requests.get(url, headers=headers)
    html.encoding = 'utf-8'
    folder_path = '\\'.join([os.getcwd(), PyQuery(html.text)('title').text().split('_')[0]])
    if not os.path.exists(folder_path):
        os.mkdir(folder_path)
    chapters = [[a.text(), a.attr('href')] for a in PyQuery(html.text)('#chapter-list-4 li a').items() if
                a.text().split('.')[0].isdigit() or a.text()[0].isdigit()]
    chapters.reverse()
    return folder_path, chapters


def get_pic_linking(path_chapters):
    """获取每章的图片链接"""
    path, chapters = path_chapters
    for name, chapter in chapters:
        html = requests.get(chapter, headers=headers)
        html.encoding = 'utf-8'
        pic_linking = [pic_url.attr('src') for pic_url in PyQuery(html.text)('div > mip-link mip-img').items()]
        folder_path = '\\'.join([path, name])
        if not os.path.exists(folder_path):
            os.mkdir(folder_path)
        img_download(folder_path, pic_linking)


def img_download(path, pics):
    """下载图片"""
    num = 1
    print(f"开始下载  >>>  {os.path.split(path)[1]}  >> 共{len(pics)}张")
    for pic in pics:
        print(num, end=' ')
        try:
            with open('\\'.join([path, str(num) + '.jpg']), 'wb') as f:
                f.write(requests.get(pic, verify=False).content)
        except requests.exceptions.RequestException:
            print("Error! Waiting 5 s before retrying...")
            time.sleep(5)
            with open('\\'.join([path, str(num) + '.jpg']), 'wb') as f:
                f.write(requests.get(pic, verify=False).content)
        num += 1
    jpg_to_pdf(path)


def jpg_to_pdf(path):
    """生成PDF文件"""
    print(f"--->>> 正在图片转pdf文件  文件路径{path}.pdf")
    jpg_path = glob.glob(f"{path}\*.jpg")
    jpg_path.sort(key=lambda x: int(os.path.basename(x).split('.')[0]))
    w, h = Image.open(jpg_path[0]).size
    ca = canvas.Canvas(path + '.pdf', pagesize=portrait((w, h)))
    for jpg in jpg_path:
        ca.drawImage(jpg, 0, 0, w, h)
        ca.showPage()
    ca.save()


def main():
    _url = 'https://m.36mh.net/manhua/yirenzhixia/'
    _chapter = get_chapter(_url)
    get_pic_linking(_chapter)


if __name__ == '__main__':
    main()

When the code runs, you may hit this error:

requests.exceptions.SSLError: HTTPSConnectionPool(host='XXX', port=443)

See: solving the requests.exceptions.SSLError: HTTPSConnectionPool(host='XXX', port=443) problem for Python crawlers
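The workaround used below is the usual quick fix: skip certificate verification and silence the warning urllib3 then emits. This trades away man-in-the-middle protection, which is acceptable for grabbing comic pages:

import requests
from requests.packages.urllib3.exceptions import InsecureRequestWarning

# Suppress the warning urllib3 emits for unverified HTTPS requests.
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

# verify=False skips certificate verification, avoiding the SSLError.
resp = requests.get('https://m.36mh.net/manhua/yirenzhixia/', verify=False)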


To work around this, and since there is rarely a need to download every chapter at once, I reworked the code.

Usage:

Enter 1 to download chapters 1-10, enter 2 for chapters 11-20, and so on...

Each batch of 10 chapters becomes one PDF, so no more downloading the whole series, haha. A sketch of the index arithmetic is below, followed by the full script.
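The mapping from the entered number to a slice of the chapter list is plain arithmetic (the helper name here is just for illustration, and this ignores, for the moment, the missing-chapter offset handled later):

def section_slice(sec, chapters):
    # Section sec covers chapters (sec-1)*10+1 .. sec*10 (1-based),
    # i.e. the 0-based slice [(sec-1)*10 : sec*10].
    return chapters[(sec - 1) * 10:sec * 10]

# e.g. sec=2 selects chapters[10:20], i.e. chapters 11-20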


from requests.packages.urllib3.exceptions import InsecureRequestWarning
from reportlab.lib.pagesizes import portrait
from reportlab.pdfgen import canvas
from pyquery import PyQuery
from PIL import Image
import requests
import shutil
import time
import glob
import os

requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36',
}


def get_chapter(url):
    """获取每章链接"""
    html = requests.get(url, headers=headers)
    html.encoding = 'utf-8'
    folder_path = '\\'.join([os.getcwd(), PyQuery(html.text)('title').text().split('_')[0]])
    if not os.path.exists(folder_path):
        os.mkdir(folder_path)
    chapters = [[a.text(), a.attr('href')] for a in PyQuery(html.text)('#chapter-list-4 li a').items() if
                a.text().split('.')[0].isdigit() or a.text()[0].isdigit()]
    chapters.reverse()
    return folder_path, chapters


def get_pic_linking(path_chapters):
    """获取图片链接"""
    folder_path, chapters = path_chapters
    pics_linking = []
    for name, chapter in chapters:
        html = requests.get(chapter, headers=headers)
        html.encoding = 'utf-8'
        pic_linking = [pic_url.attr('src') for pic_url in PyQuery(html.text)('div > mip-link mip-img').items()]
        pics_linking += pic_linking
    if not os.path.exists(folder_path):
        os.mkdir(folder_path)
    try:
        img_download(folder_path, pics_linking)
    except Exception:
        print("Something went wrong, please try again o(╥﹏╥)o")
        shutil.rmtree(folder_path)


def img_download(path, pics):
    """Download the images, then build the PDF."""
    print(f"Downloading  >>>  {os.path.split(path)[1]}  >> {len(pics)} images in total")
    for num, pic in enumerate(pics, start=1):
        print(num, end=' ')
        if num % 30 == 0:  # wrap the counter output every 30 images
            print()
        try:
            with open('\\'.join([path, str(num) + '.jpg']), 'wb') as f:
                f.write(requests.get(pic, verify=False).content)
        except requests.exceptions.RequestException:
            print("Error! Waiting 5 s before retrying...")
            time.sleep(5)
            with open('\\'.join([path, str(num) + '.jpg']), 'wb') as f:
                f.write(requests.get(pic, verify=False).content)
    jpg_to_pdf(path)
    shutil.rmtree(path)


def jpg_to_pdf(path):
    """生成PDF文件"""
    print(f"\n--->>> 正在图片转pdf文件  文件路径{path}.pdf")
    jpg_path = glob.glob(f"{path}\*.jpg")
    jpg_path.sort(key=lambda x: int(os.path.basename(x).split('.')[0]))
    w, h = Image.open(jpg_path[0]).size
    ca = canvas.Canvas(path + '.pdf', pagesize=portrait((w, h)))
    for jpg in jpg_path:
        ca.drawImage(jpg, 0, 0, w, h)
        ca.showPage()
    ca.save()


def select_section(section, chapters):
    """Pick which range of chapters to download."""
    sec = int(section)
    name = f'{(sec - 1) * 10 + 1}-{sec * 10}'
    if sec * 10 > len(chapters[1]) + 14:
        print(f"The manga only goes up to chapter {len(chapters[1]) + 4} and you want {(sec - 1) * 10 + 1}-{sec * 10}, are you kidding?!")
        exit()
    if sec < 43:
        chapter = chapters[1][(sec - 1) * 10:sec * 10]
    elif sec == 43:
        chapter = chapters[1][(sec - 1) * 10:sec * 10 - 4]
        print("Note: chapters 425-428 are missing!")
    elif sec * 10 < len(chapters[1]) + 4:
        chapter = chapters[1][(sec - 1) * 10 - 4:sec * 10 - 4]
    else:
        print(f"The manga only goes up to chapter {len(chapters[1]) + 4}, so you can only get {(sec - 1) * 10 + 1}-{len(chapters[1]) + 4}  o(╥﹏╥)o")
        chapter = chapters[1][(sec - 1) * 10 - 4:]
        name = f"{(sec - 1) * 10 + 1}-{len(chapters[1]) + 4}"
    return chapters[0] + f"\\Chapters {name}", chapter


def main():
    _url = 'https://m.36mh.net/manhua/yirenzhixia/'
    print("输入1,则下载1-10话,输入2,则下载11-20话,以此类推......")
    _section = input("请输入指定数字:")
    _chapter = get_chapter(_url)
    _chapters = select_section(_section, _chapter)
    get_pic_linking(_chapters)


if __name__ == '__main__':
    main()

Because this site is missing chapters 425-428, a helper function compensates for the gap (if the site adds those chapters later, feel free to adjust the code yourself, or message me):

def select_section(section, chapters):
    """Pick which range of chapters to download."""
    sec = int(section)
    name = f'{(sec - 1) * 10 + 1}-{sec * 10}'
    if sec * 10 > len(chapters[1]) + 14:
        print(f"The manga only goes up to chapter {len(chapters[1]) + 4} and you want {(sec - 1) * 10 + 1}-{sec * 10}, are you kidding?!")
        exit()
    if sec < 43:
        chapter = chapters[1][(sec - 1) * 10:sec * 10]
    elif sec == 43:
        chapter = chapters[1][(sec - 1) * 10:sec * 10 - 4]
        print("Note: chapters 425-428 are missing!")
    elif sec * 10 < len(chapters[1]) + 4:
        chapter = chapters[1][(sec - 1) * 10 - 4:sec * 10 - 4]
    else:
        print(f"The manga only goes up to chapter {len(chapters[1]) + 4}, so you can only get {(sec - 1) * 10 + 1}-{len(chapters[1]) + 4}  o(╥﹏╥)o")
        chapter = chapters[1][(sec - 1) * 10 - 4:]
        name = f"{(sec - 1) * 10 + 1}-{len(chapters[1]) + 4}"
    return chapters[0] + f"\\Chapters {name}", chapter
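To see the offset at work, assume (hypothetically) that the index page lists 600 chapter links. Since 425-428 are absent, the true latest chapter is 600 + 4 = 604, and any slice past chapter 428 must shift back by four positions:

# Hypothetical: 600 links listed, chapters 425-428 absent,
# so the latest chapter is really 600 + 4 = 604.
sec = 44  # wants chapters 431-440
start, end = (sec - 1) * 10 - 4, sec * 10 - 4  # slice [426:436]
# chapters[1][426] is the link for chapter 431: chapters 1-424 occupy
# indices 0-423, then 429 and 430 take 424-425, so 431 lands at 426.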

And finally, a version with a progress-bar display:

from requests.packages.urllib3.exceptions import InsecureRequestWarning
from reportlab.lib.pagesizes import portrait
from reportlab.pdfgen import canvas
from pyquery import PyQuery
from PIL import Image
import requests
import shutil
import time
import glob
import os

requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36',
}


def get_chapter(url):
    """获取每章链接"""
    html = requests.get(url, headers=headers)
    html.encoding = 'utf-8'
    folder_path = '\\'.join([os.getcwd(), PyQuery(html.text)('title').text().split('_')[0]])
    if not os.path.exists(folder_path):
        os.mkdir(folder_path)
    chapters = [[a.text(), a.attr('href')] for a in PyQuery(html.text)('#chapter-list-4 li a').items() if
                a.text().split('.')[0].isdigit() or a.text()[0].isdigit()]
    chapters.reverse()
    return folder_path, chapters


def get_pic_linking(path_chapters):
    """获取图片链接"""
    folder_path, chapters = path_chapters
    pics_linking = []
    for name, chapter in chapters:
        html = requests.get(chapter, headers=headers)
        html.encoding = 'utf-8'
        pic_linking = [pic_url.attr('src') for pic_url in PyQuery(html.text)('div > mip-link mip-img').items()]
        pics_linking += pic_linking
    if not os.path.exists(folder_path):
        os.mkdir(folder_path)
    try:
        img_download(folder_path, pics_linking)
    except Exception as e:
        print(e)
        print("出错了,请重新尝试o(╥﹏╥)o")
        shutil.rmtree(folder_path)


def img_download(path, pics):
    """下载图片"""
    print(f"开始下载  >>>  {os.path.split(path)[1]}  >> 共{len(pics)}张")
    for num, pic in enumerate(pics):
        print(f'\r{"▇" * ((num + 1) // 2)} {(num + 1) / len(pics) * 100:.0f}%', end='')
        try:
            with open('\\'.join([path, str(num + 1) + '.jpg']), 'wb') as f:
                f.write(requests.get(pic, verify=False).content)
        except requests.exceptions.RequestException:
            time.sleep(5)
            with open('\\'.join([path, str(num + 1) + '.jpg']), 'wb') as f:
                f.write(requests.get(pic, verify=False).content)
    jpg_to_pdf(path)
    shutil.rmtree(path)


def jpg_to_pdf(path):
    """生成PDF文件"""
    print(f"\n--->>> 正在图片转pdf文件  文件路径{path}.pdf")
    jpg_path = glob.glob(f"{path}\*.jpg")
    jpg_path.sort(key=lambda x: int(os.path.basename(x).split('.')[0]))
    w, h = Image.open(jpg_path[0]).size
    ca = canvas.Canvas(path + '.pdf', pagesize=portrait((w, h)))
    for jpg in jpg_path:
        ca.drawImage(jpg, 0, 0, w, h)
        ca.showPage()
    ca.save()


def select_section(section, chapters):
    """Pick which range of chapters to download."""
    sec = int(section)
    name = f'{(sec - 1) * 10 + 1}-{sec * 10}'
    if sec * 10 > len(chapters[1]) + 14:
        print(f"The manga only goes up to chapter {len(chapters[1]) + 4} and you want {(sec - 1) * 10 + 1}-{sec * 10}, are you kidding?!")
        exit()
    if sec < 43:
        chapter = chapters[1][(sec - 1) * 10:sec * 10]
    elif sec == 43:
        chapter = chapters[1][(sec - 1) * 10:sec * 10 - 4]
        print("Note: chapters 425-428 are missing!")
    elif sec * 10 < len(chapters[1]) + 4:
        chapter = chapters[1][(sec - 1) * 10 - 4:sec * 10 - 4]
    else:
        print(f"The manga only goes up to chapter {len(chapters[1]) + 4}, so you can only get {(sec - 1) * 10 + 1}-{len(chapters[1]) + 4}  o(╥﹏╥)o")
        chapter = chapters[1][(sec - 1) * 10 - 4:]
        name = f"{(sec - 1) * 10 + 1}-{len(chapters[1]) + 4}"
    return chapters[0] + f"\\Chapters {name}", chapter


def main():
    _url = 'https://m.36mh.net/manhua/yirenzhixia/'
    print("输入1,则下载1-10话,输入2,则下载11-20话,以此类推......")
    _section = input("请输入指定数字:")
    _chapter = get_chapter(_url)
    _chapters = select_section(_section, _chapter)
    get_pic_linking(_chapters)


if __name__ == '__main__':
    main()
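The progress bar is a one-liner built on the carriage return: '\r' moves the cursor back to the start of the line, so each print overwrites the previous one. A minimal standalone sketch of the same trick:

import time

total = 50
for num in range(total):
    time.sleep(0.05)  # stand-in for one image download
    # '\r' returns the cursor to column 0, so the bar redraws in place.
    print(f'\r{"▇" * ((num + 1) // 2)} {(num + 1) / total * 100:.0f}%', end='')
print()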

I ran this with PyCharm; the bundled IDLE apparently doesn't work, the output looks odd. That is expected: the bar relies on the carriage-return trick above, which PyCharm's run console supports but IDLE's shell does not, so in IDLE the updates pile up instead of redrawing in place.