天天看点

Java 爬虫(获取页面中的书名、作者名、img 地址,以及批量下载 img)

下载图片

package cn.tedu;

import java.io.*;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Minimal image crawler: fetches one HTML page, extracts the absolute URLs
 * found inside its {@code <img>} tags and saves each one to a numbered
 * {@code .jpg} file on disk.
 */
public class ImgTest {

    // Page that is scraped for images.
    private static final String URL = "http://www.ireader.com/index.php?ca=booksort.index&pid=92&cid=142&order=download&status=0&page=3";
    // Matches exactly one <img ...> tag that has a src attribute. Using [^>]
    // instead of '.' keeps a match from running across several tags on the
    // same line and lets a tag span multiple lines.
    private static final String IMGURL_REG = "<img\\b[^>]*?src\\s*=[^>]*>";
    // Matches an absolute URL (scheme://...) and stops at whitespace, quotes
    // or angle brackets, so no trailing delimiter has to be trimmed afterwards.
    private static final String IMGSRC_REG = "[a-zA-Z]+://[^\\s\"'<>]+";

    // Compiled once; Pattern instances are immutable and reusable.
    private static final Pattern IMG_TAG_PATTERN = Pattern.compile(IMGURL_REG, Pattern.CASE_INSENSITIVE);
    private static final Pattern IMG_SRC_PATTERN = Pattern.compile(IMGSRC_REG);

    /**
     * Downloads every image referenced by {@link #URL} into
     * {@code d:/library/history/}, naming the files 121.jpg, 122.jpg, ...
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            ImgTest cm = new ImgTest();
            // Raw HTML of the page.
            String html = cm.getHtml(URL);
            // Complete <img> tags found in the page.
            List<String> imgUrl = cm.getImageUrl(html);
            // Absolute URLs found inside those tags.
            List<String> imgSrc = cm.getImageSrc(imgUrl);
            int count = 120;
            for (String is : imgSrc) {
                count++;
                String path = "d:/library/history/" + count + ".jpg";
                downloadPicture(is, path);
            }
            // Only report success when the crawl itself did not throw.
            System.out.println("下载成功");
        } catch (Exception e) {
            System.out.println("发生错误");
            // Surface the cause instead of silently discarding it.
            e.printStackTrace();
        }
    }

    /**
     * Fetches the document at {@code url} and returns it as text, with every
     * line terminated by {@code '\n'}.
     *
     * @param url address of the page to fetch
     * @return the page content decoded as UTF-8
     * @throws Exception if the URL is malformed or the page cannot be read
     */
    private String getHtml(String url) throws Exception {
        URLConnection connection = new URL(url).openConnection();
        StringBuilder sb = new StringBuilder();
        // try-with-resources closes the whole reader chain even when reading fails.
        // The charset is fixed so the result does not depend on the platform default.
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(connection.getInputStream(), "UTF-8"))) {
            String line;
            while ((line = br.readLine()) != null) {
                sb.append(line).append('\n');
            }
        }
        return sb.toString();
    }

    /**
     * Collects every {@code <img>} tag with a {@code src} attribute from the
     * given HTML, in document order.
     *
     * @param html HTML text to scan; {@code null} yields an empty list
     * @return the matched tags, one list entry per tag
     */
    private List<String> getImageUrl(String html) {
        List<String> tags = new ArrayList<>();
        if (html == null) {
            return tags;
        }
        Matcher matcher = IMG_TAG_PATTERN.matcher(html);
        while (matcher.find()) {
            tags.add(matcher.group());
        }
        return tags;
    }

    /**
     * Extracts every absolute URL contained in the given {@code <img>} tags.
     *
     * @param listimageurl tags as returned by {@link #getImageUrl(String)};
     *                     {@code null} yields an empty list
     * @return the URLs in encounter order, without surrounding quotes
     */
    private List<String> getImageSrc(List<String> listimageurl) {
        List<String> sources = new ArrayList<>();
        if (listimageurl == null) {
            return sources;
        }
        for (String tag : listimageurl) {
            Matcher matcher = IMG_SRC_PATTERN.matcher(tag);
            while (matcher.find()) {
                // The pattern already excludes the closing quote, so the match
                // is the URL itself.
                sources.add(matcher.group());
            }
        }
        return sources;
    }

    /**
     * Downloads the resource at {@code urlList} and writes it to {@code path}.
     * Failures are reported on standard error and do not propagate, so one
     * broken image does not abort the remaining downloads.
     *
     * @param urlList absolute URL of the image
     * @param path    destination file; missing parent directories are created
     */
    private static void downloadPicture(String urlList, String path) {
        try {
            URL url = new URL(urlList);
            File target = new File(path);
            File parent = target.getParentFile();
            if (parent != null && !parent.isDirectory() && !parent.mkdirs()) {
                throw new IOException("Cannot create directory " + parent);
            }
            // The input is opened first so a failed connection does not leave
            // an empty file behind; both streams are closed on every exit path.
            try (InputStream in = url.openStream();
                 OutputStream out = new FileOutputStream(target)) {
                byte[] buffer = new byte[8192];
                int length;
                // Stream straight to disk instead of holding the image in memory.
                while ((length = in.read(buffer)) != -1) {
                    out.write(buffer, 0, length);
                }
            }
        } catch (MalformedURLException e) {
            System.err.println("Invalid image URL: " + urlList);
            e.printStackTrace();
        } catch (IOException e) {
            System.err.println("Failed to download " + urlList + " to " + path);
            e.printStackTrace();
        }
    }
}
           

获取书名和作者名

@Test
	/**
	 * Fetches the book listing page and prints, to standard output, the text
	 * of every element matched by two CSS selectors: first all matches of
	 * ".bookMation h3 a" with the title label, then all matches of ".tryread"
	 * with the author label.
	 *
	 * @throws Exception if the page cannot be fetched or parsed
	 */
	public void getMessage() throws Exception {
		// Download and parse the page in one step.
		Document page=Jsoup.connect("http://www.ireader.com/index.php?ca=booksort.index&pid=92&cid=142&order=download&status=0&page=3").get();

		// Links inside the h3 of each ".bookMation" element are printed as titles.
		for (Element titleLink : page.select(".bookMation h3 a")) {
			System.out.println("书名: "+titleLink.text());
		}

		// Elements with class "tryread" are printed as authors.
		for (Element authorNode : page.select(".tryread")) {
			System.out.println("作者: "+authorNode.text());
		}
	}