I've been watching season 4 of Yi Ren Zhi Xia (the Chen Duo arc) on Tencent Video, but this season is only 12 episodes and has already wrapped up. Still full of curiosity about Chen Duo and unable to wait for the next season... so, O(∩_∩)O haha~
The protagonist of this post:
36漫画网 (m.36mh.net)
This site's anti-scraping measures are pretty lax (commendable!), so I won't walk through it step by step: it's a very simple scraping workflow, and although the images are lazy-loaded, they're easy to find.
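For instance, to confirm where the real image URLs live, here is a minimal sketch (the chapter URL below is a hypothetical placeholder; the 'div > mip-link mip-img' selector is the one the full script uses):

import requests
from pyquery import PyQuery

# Hypothetical chapter page URL; substitute a real one from the chapter list
url = 'https://m.36mh.net/manhua/yirenzhixia/XXXXXX.html'
html = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
html.encoding = 'utf-8'
for img in PyQuery(html.text)('div > mip-link mip-img').items():
    # Lazy-loaded images often stash the real URL in data-src;
    # on this site it is available via src
    print(img.attr('data-src') or img.attr('src'))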
Straight to the code:
from requests.packages.urllib3.exceptions import InsecureRequestWarning
from reportlab.lib.pagesizes import portrait
from reportlab.pdfgen import canvas
from pyquery import PyQuery
from PIL import Image
import requests
import time
import glob
import os

# Silence the warning triggered by verify=False requests
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36',
}

def get_chapter(url):
    """Fetch the URL of every chapter."""
    html = requests.get(url, headers=headers)
    html.encoding = 'utf-8'
    # Folder in the working directory, named after the page title
    folder_path = '\\'.join([os.getcwd(), PyQuery(html.text)('title').text().split('_')[0]])
    if not os.path.exists(folder_path):
        os.mkdir(folder_path)
    # Keep only the entries whose text starts with a chapter number
    chapters = [[a.text(), a.attr('href')] for a in PyQuery(html.text)('#chapter-list-4 li a').items() if
                a.text().split('.')[0].isdigit() or a.text()[0].isdigit()]
    chapters.reverse()  # the site lists chapters newest first
    return folder_path, chapters

def get_pic_linking(path_chapters):
    """Fetch the image links of every chapter."""
    path, chapters = path_chapters
    for name, chapter in chapters:
        html = requests.get(chapter, headers=headers)
        html.encoding = 'utf-8'
        pic_linking = [pic_url.attr('src') for pic_url in PyQuery(html.text)('div > mip-link mip-img').items()]
        folder_path = '\\'.join([path, name])
        if not os.path.exists(folder_path):
            os.mkdir(folder_path)
        img_download(folder_path, pic_linking)

def img_download(path, pics):
    """Download the images."""
    num = 1
    print(f"Start downloading >>> {os.path.split(path)[1]} >> {len(pics)} images in total")
    for pic in pics:
        print(num, end=' ')
        try:
            with open('\\'.join([path, str(num) + '.jpg']), 'wb') as f:
                f.write(requests.get(pic, verify=False).content)
        except Exception:
            # On failure, wait a moment and retry once
            print("Error! Waiting 5 s...")
            time.sleep(5)
            with open('\\'.join([path, str(num) + '.jpg']), 'wb') as f:
                f.write(requests.get(pic, verify=False).content)
        num += 1
    jpg_to_pdf(path)

def jpg_to_pdf(path):
    """Build the PDF file."""
    print(f"--->>> Converting images to PDF, output: {path}.pdf")
    jpg_path = glob.glob(f"{path}\\*.jpg")
    jpg_path.sort(key=lambda x: int(os.path.basename(x).split('.')[0]))
    w, h = Image.open(jpg_path[0]).size  # page size comes from the first image
    ca = canvas.Canvas(path + '.pdf', pagesize=portrait((w, h)))
    for jpg in jpg_path:
        ca.drawImage(jpg, 0, 0, w, h)
        ca.showPage()  # one image per page
    ca.save()

def main():
    _url = 'https://m.36mh.net/manhua/yirenzhixia/'
    _chapter = get_chapter(_url)
    get_pic_linking(_chapter)

if __name__ == '__main__':
    main()
When the code runs, you may hit this error:
requests.exceptions.SSLError: HTTPSConnectionPool(host='XXX', port=443)
(see: fixing the requests.exceptions.SSLError: HTTPSConnectionPool(host='XXX', port=443) problem in Python scrapers)
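A common workaround, and the one already used above for the image requests, is to skip certificate verification and silence the resulting warning. A minimal sketch (the URL is a placeholder for illustration):

import requests
from requests.packages.urllib3.exceptions import InsecureRequestWarning

# Silence the InsecureRequestWarning that verify=False triggers
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

# verify=False skips certificate verification for hosts with broken certs
content = requests.get('https://host-with-bad-cert.example/img.jpg', verify=False).content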
To get around this, and since downloading every chapter is rarely necessary anyway, I reorganized the code.
Usage:
Enter 1 to download chapters 1-10, enter 2 for chapters 11-20, and so on...
Each batch of 10 chapters becomes one PDF, so there's no need to download all the chapters, haha.
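The batch-to-slice arithmetic is plain, roughly like this (where chapters is the ordered chapter list, the second element returned by get_chapter; the missing-chapter offset is handled later):

sec = 2                             # the number the user types in
start = (sec - 1) * 10              # 0-based index of the first chapter in the batch
batch = chapters[start:start + 10]  # chapters 11-20 for sec == 2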
from requests.packages.urllib3.exceptions import InsecureRequestWarning
from reportlab.lib.pagesizes import portrait
from reportlab.pdfgen import canvas
from pyquery import PyQuery
from PIL import Image
import requests
import shutil
import time
import glob
import os

# Silence the warning triggered by verify=False requests
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36',
}

def get_chapter(url):
    """Fetch the link of every chapter."""
    html = requests.get(url, headers=headers)
    html.encoding = 'utf-8'
    folder_path = '\\'.join([os.getcwd(), PyQuery(html.text)('title').text().split('_')[0]])
    if not os.path.exists(folder_path):
        os.mkdir(folder_path)
    chapters = [[a.text(), a.attr('href')] for a in PyQuery(html.text)('#chapter-list-4 li a').items() if
                a.text().split('.')[0].isdigit() or a.text()[0].isdigit()]
    chapters.reverse()  # the site lists chapters newest first
    return folder_path, chapters

def get_pic_linking(path_chapters):
    """Fetch the image links."""
    folder_path, chapters = path_chapters
    pics_linking = []
    for name, chapter in chapters:
        html = requests.get(chapter, headers=headers)
        html.encoding = 'utf-8'
        pic_linking = [pic_url.attr('src') for pic_url in PyQuery(html.text)('div > mip-link mip-img').items()]
        pics_linking += pic_linking
    if not os.path.exists(folder_path):
        os.mkdir(folder_path)
    try:
        img_download(folder_path, pics_linking)
    except Exception:
        print("Something went wrong, please try again o(╥﹏╥)o")
        shutil.rmtree(folder_path)  # drop the partial download

def img_download(path, pics):
    """Download the images."""
    num = 1
    row = list(range(1, 30))  # line-break markers: wrap the printed counter every 30 images
    print(f"Start downloading >>> {os.path.split(path)[1]} >> {len(pics)} images in total")
    for pic in pics:
        print(num, end=' ')
        if num // 30 in row:
            print()
            row.pop(0)
        try:
            with open('\\'.join([path, str(num) + '.jpg']), 'wb') as f:
                f.write(requests.get(pic, verify=False).content)
        except Exception:
            # On failure, wait a moment and retry once
            print("Error! Waiting 5 s...")
            time.sleep(5)
            with open('\\'.join([path, str(num) + '.jpg']), 'wb') as f:
                f.write(requests.get(pic, verify=False).content)
        num += 1
    jpg_to_pdf(path)
    shutil.rmtree(path)  # remove the image folder once the PDF is built

def jpg_to_pdf(path):
    """Build the PDF file."""
    print(f"\n--->>> Converting images to PDF, output: {path}.pdf")
    jpg_path = glob.glob(f"{path}\\*.jpg")
    jpg_path.sort(key=lambda x: int(os.path.basename(x).split('.')[0]))
    w, h = Image.open(jpg_path[0]).size
    ca = canvas.Canvas(path + '.pdf', pagesize=portrait((w, h)))
    for jpg in jpg_path:
        ca.drawImage(jpg, 0, 0, w, h)
        ca.showPage()
    ca.save()

def select_section(section, chapters):
    """Select the download range."""
    sec = int(section)
    name = f'{(sec - 1) * 10 + 1}-{sec * 10}'
    # Chapters 425-428 are missing from the site, hence the +4/-4 offsets
    if sec * 10 > len(chapters[1]) + 14:
        print(f"The manga only goes up to chapter {len(chapters[1]) + 4}, and you want {(sec - 1) * 10 + 1}-{sec * 10}? Seriously?!")
        exit()
    if sec < 43:
        chapter = chapters[1][(sec - 1) * 10:sec * 10]
    elif sec == 43:
        chapter = chapters[1][(sec - 1) * 10:sec * 10 - 4]
        print("Note: chapters 425-428 are missing!")
    elif sec * 10 < len(chapters[1]) + 4:
        chapter = chapters[1][(sec - 1) * 10 - 4:sec * 10 - 4]
    else:
        print(f"The manga only goes up to chapter {len(chapters[1]) + 4}, so only {(sec - 1) * 10 + 1}-{len(chapters[1]) + 4} can be downloaded o(╥﹏╥)o")
        chapter = chapters[1][(sec - 1) * 10 - 4:]
        name = f"{(sec - 1) * 10 + 1}-{len(chapters[1]) + 4}"
    return chapters[0] + f"\\Chapters {name}", chapter

def main():
    _url = 'https://m.36mh.net/manhua/yirenzhixia/'
    print("Enter 1 to download chapters 1-10, 2 for chapters 11-20, and so on...")
    _section = input("Enter a number: ")
    _chapter = get_chapter(_url)
    _chapters = select_section(_section, _chapter)
    get_pic_linking(_chapters)

if __name__ == '__main__':
    main()
Because this site is missing chapters 425-428 (see the screenshot below), I use a function to handle the gap (if the site adds these chapters in a future update, feel free to adjust the code yourself, or send me a message):
def select_section(section, chapters):
    """Select the download range."""
    sec = int(section)
    name = f'{(sec - 1) * 10 + 1}-{sec * 10}'
    # Chapters 425-428 are missing from the site, hence the +4/-4 offsets
    if sec * 10 > len(chapters[1]) + 14:
        print(f"The manga only goes up to chapter {len(chapters[1]) + 4}, and you want {(sec - 1) * 10 + 1}-{sec * 10}? Seriously?!")
        exit()
    if sec < 43:
        # Batches entirely before the gap need no offset
        chapter = chapters[1][(sec - 1) * 10:sec * 10]
    elif sec == 43:
        # The batch containing the gap comes out 4 chapters short
        chapter = chapters[1][(sec - 1) * 10:sec * 10 - 4]
        print("Note: chapters 425-428 are missing!")
    elif sec * 10 < len(chapters[1]) + 4:
        # Batches after the gap: shift both slice bounds left by 4
        chapter = chapters[1][(sec - 1) * 10 - 4:sec * 10 - 4]
    else:
        print(f"The manga only goes up to chapter {len(chapters[1]) + 4}, so only {(sec - 1) * 10 + 1}-{len(chapters[1]) + 4} can be downloaded o(╥﹏╥)o")
        chapter = chapters[1][(sec - 1) * 10 - 4:]
        name = f"{(sec - 1) * 10 + 1}-{len(chapters[1]) + 4}"
    return chapters[0] + f"\\Chapters {name}", chapter
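To see where the offset of 4 comes from: the list returned by get_chapter simply skips chapters 425-428, so from chapter 429 onward a chapter's list index lags its chapter number by 4. A quick sanity check of the arithmetic (the values are hypothetical):

sec = 44                                        # user asks for chapters 431-440
start, stop = (sec - 1) * 10 - 4, sec * 10 - 4  # shift both bounds left by 4
print(start, stop)  # 426 436 -> list entries 426..435 hold chapters 431..440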
Progress bar display
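The progress bar is plain print trickery: the carriage return \r moves the cursor back to the start of the line, so each print overwrites the previous one. A standalone sketch of the technique used in the listing below:

import time

total = 50  # stand-in for len(pics)
for num in range(total):
    # end='' suppresses the newline; '\r' redraws over the same line
    print(f'\r{"▇" * ((num + 1) // 2)} {(num + 1) / total * 100:.0f}%', end='')
    time.sleep(0.05)
print()  # final newline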
from requests.packages.urllib3.exceptions import InsecureRequestWarning
from reportlab.lib.pagesizes import portrait
from reportlab.pdfgen import canvas
from pyquery import PyQuery
from PIL import Image
import requests
import shutil
import time
import glob
import os

# Silence the warning triggered by verify=False requests
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36',
}

def get_chapter(url):
    """Fetch the link of every chapter."""
    html = requests.get(url, headers=headers)
    html.encoding = 'utf-8'
    folder_path = '\\'.join([os.getcwd(), PyQuery(html.text)('title').text().split('_')[0]])
    if not os.path.exists(folder_path):
        os.mkdir(folder_path)
    chapters = [[a.text(), a.attr('href')] for a in PyQuery(html.text)('#chapter-list-4 li a').items() if
                a.text().split('.')[0].isdigit() or a.text()[0].isdigit()]
    chapters.reverse()  # the site lists chapters newest first
    return folder_path, chapters

def get_pic_linking(path_chapters):
    """Fetch the image links."""
    folder_path, chapters = path_chapters
    pics_linking = []
    for name, chapter in chapters:
        html = requests.get(chapter, headers=headers)
        html.encoding = 'utf-8'
        pic_linking = [pic_url.attr('src') for pic_url in PyQuery(html.text)('div > mip-link mip-img').items()]
        pics_linking += pic_linking
    if not os.path.exists(folder_path):
        os.mkdir(folder_path)
    try:
        img_download(folder_path, pics_linking)
    except Exception as e:
        print(e)
        print("Something went wrong, please try again o(╥﹏╥)o")
        shutil.rmtree(folder_path)  # drop the partial download

def img_download(path, pics):
    """Download the images."""
    print(f"Start downloading >>> {os.path.split(path)[1]} >> {len(pics)} images in total")
    for num, pic in enumerate(pics):
        # \r rewinds to the start of the line so the bar redraws in place
        print(f'\r{"▇" * ((num + 1) // 2)} {(num + 1) / len(pics) * 100:.0f}%', end='')
        try:
            with open('\\'.join([path, str(num + 1) + '.jpg']), 'wb') as f:
                f.write(requests.get(pic, verify=False).content)
        except Exception:
            # On failure, wait a moment and retry once
            time.sleep(5)
            with open('\\'.join([path, str(num + 1) + '.jpg']), 'wb') as f:
                f.write(requests.get(pic, verify=False).content)
    jpg_to_pdf(path)
    shutil.rmtree(path)  # remove the image folder once the PDF is built

def jpg_to_pdf(path):
    """Build the PDF file."""
    print(f"\n--->>> Converting images to PDF, output: {path}.pdf")
    jpg_path = glob.glob(f"{path}\\*.jpg")
    jpg_path.sort(key=lambda x: int(os.path.basename(x).split('.')[0]))
    w, h = Image.open(jpg_path[0]).size
    ca = canvas.Canvas(path + '.pdf', pagesize=portrait((w, h)))
    for jpg in jpg_path:
        ca.drawImage(jpg, 0, 0, w, h)
        ca.showPage()
    ca.save()

def select_section(section, chapters):
    """Select the download range."""
    sec = int(section)
    name = f'{(sec - 1) * 10 + 1}-{sec * 10}'
    # Chapters 425-428 are missing from the site, hence the +4/-4 offsets
    if sec * 10 > len(chapters[1]) + 14:
        print(f"The manga only goes up to chapter {len(chapters[1]) + 4}, and you want {(sec - 1) * 10 + 1}-{sec * 10}? Seriously?!")
        exit()
    if sec < 43:
        chapter = chapters[1][(sec - 1) * 10:sec * 10]
    elif sec == 43:
        chapter = chapters[1][(sec - 1) * 10:sec * 10 - 4]
        print("Note: chapters 425-428 are missing!")
    elif sec * 10 < len(chapters[1]) + 4:
        chapter = chapters[1][(sec - 1) * 10 - 4:sec * 10 - 4]
    else:
        print(f"The manga only goes up to chapter {len(chapters[1]) + 4}, so only {(sec - 1) * 10 + 1}-{len(chapters[1]) + 4} can be downloaded o(╥﹏╥)o")
        chapter = chapters[1][(sec - 1) * 10 - 4:]
        name = f"{(sec - 1) * 10 + 1}-{len(chapters[1]) + 4}"
    return chapters[0] + f"\\Chapters {name}", chapter

def main():
    _url = 'https://m.36mh.net/manhua/yirenzhixia/'
    print("Enter 1 to download chapters 1-10, 2 for chapters 11-20, and so on...")
    _section = input("Enter a number: ")
    _chapter = get_chapter(_url)
    _chapters = select_section(_section, _chapter)
    get_pic_linking(_chapters)

if __name__ == '__main__':
    main()
I ran this in PyCharm; it doesn't seem to work in the bundled IDLE, oddly enough, most likely because IDLE's shell doesn't honor the carriage return \r, so the progress bar can't redraw in place.