天天看点

Java 抓取网页的图片

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Downloads the images referenced by {@code <img>} tags on a web page.
 *
 * <p>The page at {@link #PAGE_URL} is fetched, its {@code <img>} tags are located with a regular
 * expression, the absolute {@code http}/{@code https} addresses inside those tags are extracted,
 * and each image is written to a file named after the last path segment of its address. The file
 * name is relative, so it resolves against the current working directory.
 *
 * <p>The lower-case class name is kept as found so that existing references keep working.
 *
 * @author swinglife
 */
public class catchimage {

    /** Address of the page that is scanned for images. */
    private static final String PAGE_URL = "http://www.csdn.net";

    /** Name of the character encoding used to decode the page. */
    private static final String ENCODING = "utf-8";

    /**
     * Matches an {@code <img ...>} tag that has a {@code src} attribute. {@code .} does not cross
     * line breaks, but the leading greedy {@code .*} may merge several tags that share one line
     * into a single match; {@link #getImageSrc(List)} copes with that by searching each match
     * repeatedly.
     */
    private static final Pattern IMG_TAG = Pattern.compile("<img.*src=(.*?)[^>]*?>");

    /**
     * Matches an absolute {@code http} or {@code https} address. Group 2 is the terminator that
     * follows the address (a double quote, {@code >}, or a run of whitespace) and is not part of
     * the address itself.
     */
    private static final Pattern IMG_SRC = Pattern.compile("https?:\"?(.*?)(\"|>|\\s+)");

    /** Size in bytes of the buffer used when copying streams. */
    private static final int BUFFER_SIZE = 1024;

    /**
     * Fetches the page, extracts its image addresses and downloads every image.
     *
     * @param args ignored
     * @throws Exception if the page itself cannot be fetched or decoded
     */
    public static void main(String[] args) throws Exception {
        catchimage cm = new catchimage();
        // Fetch the HTML text of the page.
        String html = cm.getHtml(PAGE_URL);
        // Collect the <img> tags.
        List<String> imgTags = cm.getImageUrl(html);
        // Pull the image addresses out of the tags.
        List<String> imgSrc = cm.getImageSrc(imgTags);
        // Download the images.
        cm.download(imgSrc);
    }

    /**
     * Fetches the content of a page as text decoded with {@link #ENCODING}.
     *
     * @param url address of the page
     * @return the decoded page content
     * @throws Exception if the address is malformed or the page cannot be read
     */
    private String getHtml(String url) throws Exception {
        URLConnection connection = new URL(url).openConnection();
        InputStream in = connection.getInputStream();
        try {
            return readFully(in, ENCODING);
        } finally {
            // Close the stream even when reading fails part-way through.
            in.close();
        }
    }

    /**
     * Reads a stream to its end and decodes all of its bytes in one step.
     *
     * <p>Decoding once, after everything has been read, keeps multi-byte characters intact when
     * they straddle two reads and never decodes bytes left over in the buffer from an earlier
     * read. The stream is not closed. Package-private so it can be unit-tested.
     *
     * @param in stream to read
     * @param charsetName name of the character encoding of the bytes
     * @return the decoded text, empty when the stream has no bytes
     * @throws IOException if reading fails or the encoding is not supported
     */
    static String readFully(InputStream in, String charsetName) throws IOException {
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        copy(in, bytes);
        return bytes.toString(charsetName);
    }

    /** Copies every remaining byte of {@code in} to {@code out}; closes neither stream. */
    private static void copy(InputStream in, OutputStream out) throws IOException {
        byte[] buf = new byte[BUFFER_SIZE];
        int length;
        while ((length = in.read(buf, 0, buf.length)) != -1) {
            // Write only the bytes delivered by this read, not the whole buffer.
            out.write(buf, 0, length);
        }
    }

    /**
     * Finds the {@code <img>} tags in a piece of HTML.
     *
     * <p>Package-private so it can be unit-tested.
     *
     * @param html HTML text to search; {@code null} is treated as empty
     * @return the text of every match of {@link #IMG_TAG}, in document order; never {@code null}
     */
    List<String> getImageUrl(String html) {
        List<String> listImgUrl = new ArrayList<String>();
        if (html == null) {
            return listImgUrl;
        }
        Matcher matcher = IMG_TAG.matcher(html);
        while (matcher.find()) {
            listImgUrl.add(matcher.group());
        }
        return listImgUrl;
    }

    /**
     * Extracts the absolute image addresses from a list of {@code <img>} tags.
     *
     * <p>Only addresses starting with {@code http:} or {@code https:} are recognised; tags whose
     * {@code src} is relative contribute nothing. Package-private so it can be unit-tested.
     *
     * @param listImageUrl tag texts as returned by {@link #getImageUrl(String)}
     * @return every address found, in order; never {@code null}
     */
    List<String> getImageSrc(List<String> listImageUrl) {
        List<String> listImgSrc = new ArrayList<String>();
        for (String image : listImageUrl) {
            Matcher matcher = IMG_SRC.matcher(image);
            // find() must succeed before the match may be queried; one entry can hold several
            // addresses, so keep searching until the input is exhausted.
            while (matcher.find()) {
                // Cut at the start of the terminator group so that neither the closing quote
                // nor a run of several whitespace characters ends up in the address.
                listImgSrc.add(image.substring(matcher.start(), matcher.start(2)));
            }
        }
        return listImgSrc;
    }

    /**
     * Derives the local file name for an image address: the text after the last {@code /} of the
     * address once any query string or fragment has been removed.
     *
     * <p>Package-private so it can be unit-tested.
     *
     * @param url image address
     * @return the file name; empty when the path part of the address ends with {@code /}
     */
    static String fileNameOf(String url) {
        int end = url.length();
        int query = url.indexOf('?');
        if (query >= 0) {
            end = query;
        }
        int fragment = url.indexOf('#');
        if (fragment >= 0 && fragment < end) {
            end = fragment;
        }
        String path = url.substring(0, end);
        return path.substring(path.lastIndexOf('/') + 1);
    }

    /**
     * Downloads each image to a file named by {@link #fileNameOf(String)}.
     *
     * <p>Every image is handled independently: a failure is reported and the remaining images
     * are still attempted.
     *
     * @param listImgSrc image addresses to download
     */
    private void download(List<String> listImgSrc) {
        for (String url : listImgSrc) {
            String imageName = fileNameOf(url);
            if (imageName.length() == 0) {
                // No file name can be derived from the address, so there is nowhere to save to.
                System.out.println("下载失败");
                System.err.println(url + ": no file name in address");
                continue;
            }
            try {
                System.out.println("开始下载:" + url);
                save(url, imageName);
                System.out.println(imageName + "下载完成");
            } catch (Exception e) {
                System.out.println("下载失败");
                // Report which address failed and why instead of discarding the cause.
                System.err.println(url + ": " + e);
            }
        }
    }

    /** Copies the content behind {@code url} into the file {@code fileName}. */
    private static void save(String url, String fileName) throws IOException {
        // Open the remote stream first so that no file is created when the connection fails.
        InputStream in = new URL(url).openStream();
        try {
            FileOutputStream fo = new FileOutputStream(new File(fileName));
            try {
                copy(in, fo);
            } finally {
                fo.close();
            }
        } finally {
            in.close();
        }
    }
}