天天看点

php 协程 爬虫,go语言 grequests+goquery 简单爬虫,使用多协程并发爬取

package main

import (

"fmt"

//go语言版本的jquery

"github.com/PuerkitoBio/goquery"

"os"

"sync"

"strings"

//go语言版本的request

"github.com/levigross/grequests"

"time"

"strconv"

)

// wg tracks the album-download goroutines started by main: each goroutine is
// registered with Add before it starts and calls Done when it finishes, and
// main blocks on Wait before reporting the elapsed time.
var wg sync.WaitGroup

// main fetches every album index page listed in initialURLs, finds each album
// cover image on the page, and starts one goroutine per album that downloads
// the album's images. It waits for all goroutines and then prints the total
// elapsed time.
func main() {
	start := time.Now()

	initialURLs := []string{"http://www.zngirls.com/girl/18071/album/"}

	for _, url := range initialURLs {
		doc, err := goquery.NewDocument(url)
		if err != nil {
			// The error must actually be written out; fmt.Errorf only builds
			// an error value and prints nothing.
			fmt.Fprintf(os.Stderr, "下载错误:%v\n", err)
			os.Exit(-1)
		}

		doc.Find(".igalleryli_link").Each(func(i int, s *goquery.Selection) {
			src, exists := s.Find("img").Attr("src")
			fmt.Printf("开始下载影集图片:%v\n", src)
			if !exists {
				return
			}

			wg.Add(1)
			go func(src string) {
				defer wg.Done()
				downloadAlbum(src)
			}(src)
		})
	}

	wg.Wait()

	// Original author's measurement: on a 4 Mbit/s link the run took 16m36s
	// for 202 MB in total (10 folders, 560 files).
	// %v renders the Duration in human-readable form (e.g. 16m36s); %#v would
	// print the raw nanosecond count.
	fmt.Printf("下载任务完成,耗时:%v\n", time.Since(start))
}

// downloadAlbum downloads the images of one album, given the URL of the
// album's cover image (src). The image URLs are derived from src by removing
// the first "cover/" path element and replacing the file name with 0.jpg,
// 001.jpg, 002.jpg, ... Images are requested one after another until the
// server answers with a non-200 status or with a body smaller than 4100
// bytes, which is treated as the site's anti-hotlinking placeholder image.
// Files are stored under the URL path starting at its "gallery" element,
// relative to the current working directory.
func downloadAlbum(src string) {
	// Derive the album base URL: drop "cover/" and the trailing file name.
	trimmed := strings.Replace(src, "cover/", "", 1)
	base := ""
	if i := strings.LastIndex(trimmed, "/"); i >= 0 {
		base = trimmed[:i]
	}

	// The local directory depends only on the base URL, so validate it once
	// up front instead of after every download.
	idx := strings.Index(base, "gallery")
	if idx == -1 {
		fmt.Fprintf(os.Stderr, "无效地址,找不到gallery关键词,解析失败:%s\n", src)
		return
	}
	dirname := base[idx:]

	for n := 0; ; n++ {
		name := imageName(n)
		url := base + "/" + name
		fmt.Printf("准备下载: %v\n", url)

		res, err := grequests.Get(url, &grequests.RequestOptions{
			// The site checks the Referer header; a browser-like User-Agent
			// is sent as well.
			Headers: map[string]string{
				"Referer":    "http://www.zngirls.com",
				"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36",
			},
		})
		if err != nil {
			fmt.Fprintf(os.Stderr, "下载失败,退出影集下载:%s: %v\n", src, err)
			return
		}

		if res.StatusCode != 200 {
			// Close error ignored: the response is being discarded anyway.
			_ = res.Close()
			fmt.Printf("下载失败,退出影集下载:%s\n", src)
			return
		}

		// When the image does not exist the site may still answer 200 with a
		// placeholder image of roughly 4 KB. A missing or unparsable
		// Content-Length is treated the same way and ends the album.
		size, err := strconv.Atoi(res.Header.Get("Content-Length"))
		if err != nil || size < 4100 {
			// Close error ignored: the response is being discarded anyway.
			_ = res.Close()
			fmt.Printf("下载内容失败,退出影集下载:%s\n", src)
			return
		}

		if _, err := os.Stat(dirname); err != nil {
			fmt.Printf("创建下载文件夹:%s\n", dirname)
			// Directories need the execute (search) bit, otherwise files
			// cannot be created inside them.
			if err := os.MkdirAll(dirname, 0755); err != nil {
				_ = res.Close()
				fmt.Fprintf(os.Stderr, "创建下载文件夹失败:%s: %v\n", dirname, err)
				return
			}
		}

		filename := dirname + "/" + name
		if err := res.DownloadToFile(filename); err != nil {
			fmt.Fprintf(os.Stderr, "保存图片失败:%s: %v\n", filename, err)
			return
		}
		fmt.Printf("成功下载图片到:%s\n", filename)
	}
}

// imageName returns the file name of the n-th image of an album: the first
// image is "0.jpg", all later ones are zero-padded to three digits
// ("001.jpg", "002.jpg", ...).
func imageName(n int) string {
	if n == 0 {
		return fmt.Sprintf("%d.jpg", n)
	}
	return fmt.Sprintf("%03d.jpg", n)
}

有疑问加站长微信联系(非本文作者)