package com.smilezl.scrapy;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Minimal link scraper: downloads one page, extracts at most one link per line of
 * markup, and stores the distinct links in the {@code scrapyurl} database table.
 *
 * <p>Class and method names are kept exactly as they appear in this file so that
 * existing callers keep compiling.
 */
public class scrapyurl {

    private static final Logger LOGGER = Logger.getLogger(scrapyurl.class.getName());

    /** Captures the charset token of a Content-Type value such as {@code text/html; charset=UTF-8}. */
    private static final Pattern CHARSET_PATTERN =
            Pattern.compile("charset\\s*=\\s*[\"']?([^\\s;\"']+)", Pattern.CASE_INSENSITIVE);

    /** Matches an absolute http/https URL anywhere in a line. */
    private static final Pattern ABSOLUTE_URL_PATTERN =
            Pattern.compile("(http://|https://){1}[\\w\\.\\-/:]+");

    /** Captures a root-relative {@code href} value that ends in "html" (this also covers ".shtml"). */
    private static final Pattern RELATIVE_HREF_PATTERN = Pattern.compile("href=\"(/[^\"]*html)");

    // NOTE(review): "&amp;" looks like an HTML-escaping leftover, and useunicode/characterencoding
    // are not PostgreSQL driver options as far as I can tell -- presumably they are ignored.
    // The literal is left untouched; confirm against the real deployment before cleaning it up.
    private static final String JDBC_URL =
            "jdbc:postgresql://localhost:5432/mydb?useunicode=true&amp;characterencoding=gbk";

    // NOTE(review): credentials are hard-coded in source; consider moving them to configuration.
    private static final String DB_USER = "postgres";
    private static final String DB_PASSWORD = "password";

    private static final String INSERT_SQL = "insert into scrapyurl(url,type) values(?,0)";

    private static final String DEFAULT_START_URL = "http://fo.ifeng.com/fojiaomeiwen/list_0/0.shtml";

    /**
     * Downloads a page and collects the links found in it.
     *
     * <p>The page is read line by line and {@link #gethref(String, String)} is applied to each
     * line, so at most one link per line is collected. Duplicates are dropped and the order of
     * first appearance is preserved. I/O failures are logged rather than thrown; whatever was
     * collected before the failure is still returned.
     *
     * @param htmlurl address of the page to download
     * @return the distinct links found, in order of first appearance; never {@code null}
     */
    public static List<String> parserhtml(String htmlurl) {
        // LinkedHashSet de-duplicates in O(1) per link while keeping insertion order.
        Set<String> links = new LinkedHashSet<>();
        URLConnection connection = null;
        try {
            connection = new URL(htmlurl).openConnection();
            Charset charset = resolveCharset(connection.getContentType());
            try (BufferedReader reader =
                    new BufferedReader(new InputStreamReader(connection.getInputStream(), charset))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    String link = gethref(line, htmlurl);
                    if (link != null) {
                        links.add(link);
                    }
                }
            }
        } catch (IOException e) {
            LOGGER.log(Level.WARNING, "Failed to read links from " + htmlurl, e);
        } finally {
            if (connection instanceof HttpURLConnection) {
                ((HttpURLConnection) connection).disconnect();
            }
        }
        return new ArrayList<>(links);
    }

    /**
     * Extracts the character encoding from a Content-Type header value.
     *
     * @param str the Content-Type value, e.g. {@code text/html; charset=utf-8}; may be {@code null}
     * @return the charset name without quotes or trailing parameters, or {@code null} when
     *         {@code str} is {@code null} or declares no charset
     */
    public static String getcharset(String str) {
        if (str == null) {
            return null;
        }
        Matcher matcher = CHARSET_PATTERN.matcher(str);
        return matcher.find() ? matcher.group(1) : null;
    }

    /**
     * Reads the first link out of a single line of markup.
     *
     * <p>An absolute {@code http://} or {@code https://} URL anywhere in the line wins. If there
     * is none, a root-relative {@code href="/...html"} attribute is resolved against
     * {@code htmlurl}.
     *
     * @param str one line of page source; may be {@code null}
     * @param htmlurl address of the page the line came from, used to resolve relative links
     * @return the link as an absolute URL, or {@code null} when the line contains no link or the
     *         relative link cannot be resolved
     */
    public static String gethref(String str, String htmlurl) {
        if (str == null) {
            return null;
        }
        Matcher absolute = ABSOLUTE_URL_PATTERN.matcher(str);
        if (absolute.find()) {
            return absolute.group(0);
        }
        Matcher relative = RELATIVE_HREF_PATTERN.matcher(str);
        if (!relative.find()) {
            return null;
        }
        try {
            // Resolve against the page URL so "/a/b.html" becomes scheme://host/a/b.html.
            return new URL(new URL(htmlurl), relative.group(1)).toString();
        } catch (MalformedURLException e) {
            return null;
        }
    }

    /**
     * Scrapes the links of a page and saves each of them as a row of the {@code scrapyurl} table
     * with {@code type} 0. Each link is also printed to standard output. Database failures are
     * logged rather than thrown.
     *
     * @param hrefurl address of the page whose links should be stored
     */
    public static void saveurllist(String hrefurl) {
        List<String> links = parserhtml(hrefurl);
        if (links.isEmpty()) {
            return; // nothing to store, so do not open a database connection at all
        }
        // JDBC 4 drivers register themselves, so no explicit Class.forName call is needed.
        try (Connection con = DriverManager.getConnection(JDBC_URL, DB_USER, DB_PASSWORD);
                PreparedStatement insert = con.prepareStatement(INSERT_SQL)) {
            for (String link : links) {
                System.out.println(link);
                // Bind the scraped value as a parameter: it is untrusted input.
                insert.setString(1, link);
                insert.executeUpdate();
            }
        } catch (SQLException e) {
            LOGGER.log(Level.SEVERE, "Failed to save links from " + hrefurl, e);
        }
    }

    /**
     * Entry point.
     *
     * @param args optional; {@code args[0]} is the page to scrape, a built-in default is used
     *        when it is absent
     */
    public static void main(String[] args) {
        String startUrl = args.length > 0 ? args[0] : DEFAULT_START_URL;
        saveurllist(startUrl);
    }

    /** Picks the charset declared in the Content-Type header, falling back to UTF-8. */
    private static Charset resolveCharset(String contentType) {
        String name = getcharset(contentType);
        if (name != null) {
            try {
                return Charset.forName(name);
            } catch (IllegalArgumentException e) {
                // Illegal or unsupported charset name: fall through to the default.
                LOGGER.log(Level.FINE, "Unsupported charset '" + name + "', using UTF-8", e);
            }
        }
        return StandardCharsets.UTF_8;
    }
}