// 天天看点
// 抓取网页链接 (Scraping the links of a web page)

package com.smilezl.scrapy;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Minimal link scraper: downloads one page, extracts at most one link per line of markup and
 * stores the distinct links in the {@code scrapyurl} table of a PostgreSQL database.
 */
public class scrapyurl {

    /** Charset used to decode a page whose Content-Type names none. */
    private static final String DEFAULT_CHARSET = "utf-8";

    /** Captures the value of a {@code charset=} parameter, without quotes or trailing parameters. */
    private static final Pattern CHARSET_PATTERN =
            Pattern.compile("charset\\s*=\\s*[\"']?([^\\s;\"']+)", Pattern.CASE_INSENSITIVE);

    /** Absolute http/https link; note that it stops at the first '?', '=', '&', '%' or '#'. */
    private static final Pattern ABSOLUTE_LINK_PATTERN =
            Pattern.compile("(http://|https://){1}[\\w\\.\\-/:]+");

    /** Root-relative {@code href} ending in "html"; confined to a single attribute value. */
    private static final Pattern RELATIVE_LINK_PATTERN =
            Pattern.compile("href=\"(/[^\"]*html)");

    private static final String JDBC_DRIVER = "org.postgresql.Driver";

    // NOTE(review): useunicode/characterencoding look like MySQL-style options; confirm that the
    // PostgreSQL driver honours them (and their casing) before relying on them.
    private static final String JDBC_URL =
            "jdbc:postgresql://localhost:5432/mydb?useunicode=true&characterencoding=gbk";

    // NOTE(review): credentials are hard-coded; consider moving them to configuration.
    private static final String DB_USER = "postgres";
    private static final String DB_PASSWORD = "password";

    private static final String INSERT_SQL = "insert into scrapyurl(url,type) values(?,0)";

    /**
     * Downloads the page at {@code htmlurl} and collects the links found in it.
     *
     * <p>The page is decoded with the charset announced in its Content-Type, falling back to
     * UTF-8. The download is best effort: an I/O failure is reported on standard error and the
     * links collected up to that point are returned.
     *
     * @param htmlurl address of the page to scan
     * @return the distinct links in order of first appearance; never {@code null}
     */
    public static List<String> parserhtml(String htmlurl) {
        // A LinkedHashSet de-duplicates in O(1) per link while preserving encounter order.
        Set<String> links = new LinkedHashSet<String>();
        URLConnection connection = null;
        try {
            connection = new URL(htmlurl).openConnection();
            String charset = getcharset(connection.getContentType());
            if (charset == null) {
                charset = DEFAULT_CHARSET;
            }
            try (InputStream in = connection.getInputStream();
                    BufferedReader reader = new BufferedReader(new InputStreamReader(in, charset))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    String link = gethref(line, htmlurl);
                    if (link != null) {
                        links.add(link);
                    }
                }
            }
        } catch (IOException e) {
            System.err.println("Failed to read " + htmlurl + ": " + e.getMessage());
            e.printStackTrace();
        } finally {
            // Only HTTP(S) connections have something to release; other schemes are left alone.
            if (connection instanceof HttpURLConnection) {
                ((HttpURLConnection) connection).disconnect();
            }
        }
        return new ArrayList<String>(links);
    }

    /**
     * Extracts the charset name from a Content-Type value such as
     * {@code "text/html; charset=utf-8"}.
     *
     * @param str the Content-Type value; may be {@code null}
     * @return the charset name, or {@code null} when {@code str} is {@code null} or names none
     */
    public static String getcharset(String str) {
        if (str == null) {
            return null;
        }
        Matcher matcher = CHARSET_PATTERN.matcher(str);
        return matcher.find() ? matcher.group(1) : null;
    }

    /**
     * Reads a link from one line of markup.
     *
     * <p>An absolute http/https link anywhere in the line takes precedence. Otherwise the first
     * root-relative {@code href="/...html"} attribute is resolved against {@code htmlurl}.
     * Only one link per line is reported.
     *
     * @param str     one line of the page; may be {@code null}
     * @param htmlurl address of the page, used as the base for relative links
     * @return the link, or {@code null} when the line holds none or the base cannot be parsed
     */
    public static String gethref(String str, String htmlurl) {
        if (str == null) {
            return null;
        }
        Matcher absolute = ABSOLUTE_LINK_PATTERN.matcher(str);
        if (absolute.find()) {
            return absolute.group(0);
        }
        Matcher relative = RELATIVE_LINK_PATTERN.matcher(str);
        if (!relative.find()) {
            return null;
        }
        try {
            // Resolve against the page address so that "/a.html" lands on the site root instead
            // of being appended to the page's own path.
            return new URL(new URL(htmlurl), relative.group(1)).toString();
        } catch (MalformedURLException e) {
            return null;
        }
    }

    /**
     * Scrapes the page at {@code hrefurl}, prints every link found and inserts each one into
     * the {@code scrapyurl} table with {@code type} 0.
     *
     * <p>Database failures are reported on standard error rather than thrown.
     *
     * @param hrefurl address of the page to scrape
     */
    public static void saveurllist(String hrefurl) {
        // Fetch first so that no database connection is held open during network I/O.
        List<String> links = parserhtml(hrefurl);
        if (links.isEmpty()) {
            return;
        }
        try {
            // Explicit load keeps drivers that do not self-register working.
            Class.forName(JDBC_DRIVER);
            try (Connection con = DriverManager.getConnection(JDBC_URL, DB_USER, DB_PASSWORD);
                    PreparedStatement insert = con.prepareStatement(INSERT_SQL)) {
                for (String link : links) {
                    System.out.println(link);
                    // Bound as a parameter: scraped text must never be spliced into SQL.
                    insert.setString(1, link);
                    insert.executeUpdate();
                }
            }
        } catch (ClassNotFoundException | SQLException e) {
            System.err.println("Failed to store links of " + hrefurl + ": " + e.getMessage());
            e.printStackTrace();
        }
    }

    /**
     * Scrapes and stores the links of a fixed sample page.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        saveurllist("http://fo.ifeng.com/fojiaomeiwen/list_0/0.shtml");
    }
}

// 继续阅读 (Continue reading)