天天看點

java之網絡爬蟲代碼

java網絡爬蟲爬取網站代碼

package Demo;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;

/**
 * Minimal web-spider demo: downloads a single page and echoes it to standard output.
 */
public class WebSpiderTest {

	/**
	 * Fetches http://www.163.com and prints the response body line by line.
	 * No charset is passed to the reader, so the bytes are decoded with the
	 * JVM's default charset. Failures are reported via a stack trace.
	 *
	 * @param args command-line arguments (unused)
	 */
	public static void main(String[] args) {
		try {
			URL url = new URL("http://www.163.com");
			// try-with-resources closes the reader (and the stream it wraps)
			// even when reading fails part-way through.
			try (BufferedReader reader = new BufferedReader(new InputStreamReader(url.openStream()))) {
				String line;
				while ((line = reader.readLine()) != null) {
					System.out.println(line);
				}
			}
		} catch (MalformedURLException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

}
           

將上一個代碼進行封裝

package Demo;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Fetches the source text of the web page behind a URL.
 *
 * @author hejun
 */
public class WebSpiderTest {

	/** Charset used when the caller passes none; this was the previously hard-coded value. */
	private static final String DEFAULT_CHARSET = "gbk";

	/**
	 * Downloads the content behind {@code urlStr} and decodes it with {@code charset}.
	 * <p>
	 * Lines are concatenated without their line terminators. I/O failures are
	 * reported via a stack trace, and whatever was read before the failure
	 * (possibly the empty string) is returned.
	 *
	 * @param urlStr  the URL to read from
	 * @param charset name of the charset used to decode the response; when
	 *                {@code null} or blank, {@code "gbk"} is used
	 * @return the decoded content, never {@code null}
	 * @throws IllegalArgumentException if {@code charset} names an illegal or
	 *                                  unsupported charset
	 */
	public static String getURLContent(String urlStr, String charset) {
		String charsetName = (charset == null || charset.trim().isEmpty()) ? DEFAULT_CHARSET : charset;
		// Resolve the charset before opening the connection so that a bad name
		// cannot leave an open stream behind.
		Charset decoder = Charset.forName(charsetName);
		StringBuilder sb = new StringBuilder();
		try {
			URL url = new URL(urlStr);
			// try-with-resources closes the reader even when reading fails.
			try (BufferedReader reader = new BufferedReader(new InputStreamReader(url.openStream(), decoder))) {
				String line;
				while ((line = reader.readLine()) != null) {
					sb.append(line);
				}
			}
		} catch (MalformedURLException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		}
		return sb.toString();
	}

	/**
	 * Downloads http://www.163.com as GBK text and prints it.
	 *
	 * @param args command-line arguments (unused)
	 */
	public static void main(String[] args) {
		String destStr = getURLContent("http://www.163.com","gbk");
		System.out.println(destStr);
	}

}
           

擷取網頁的超連結,及網頁亂碼處理

package Demo;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Fetches the source text of a web page and extracts the hyperlinks it contains.
 *
 * @author hejun
 */
public class WebSpiderTest {

	/** Charset used when the caller passes none; this was the previously hard-coded value. */
	private static final String DEFAULT_CHARSET = "gbk";

	/**
	 * Downloads the content behind {@code urlStr} and decodes it with {@code charset}.
	 * <p>
	 * Lines are concatenated without their line terminators. I/O failures are
	 * reported via a stack trace, and whatever was read before the failure
	 * (possibly the empty string) is returned.
	 *
	 * @param urlStr  the URL to read from
	 * @param charset name of the charset used to decode the response; when
	 *                {@code null} or blank, {@code "gbk"} is used
	 * @return the decoded content, never {@code null}
	 * @throws IllegalArgumentException if {@code charset} names an illegal or
	 *                                  unsupported charset
	 */
	public static String getURLContent(String urlStr, String charset) {
		String charsetName = (charset == null || charset.trim().isEmpty()) ? DEFAULT_CHARSET : charset;
		// Resolve the charset before opening the connection so that a bad name
		// cannot leave an open stream behind.
		Charset decoder = Charset.forName(charsetName);
		StringBuilder sb = new StringBuilder();
		try {
			URL url = new URL(urlStr);
			// try-with-resources closes the reader even when reading fails.
			try (BufferedReader reader = new BufferedReader(new InputStreamReader(url.openStream(), decoder))) {
				String line;
				while ((line = reader.readLine()) != null) {
					sb.append(line);
				}
			}
		} catch (MalformedURLException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		}
		return sb.toString();
	}

	/**
	 * Downloads http://www.163.com as GBK text, prints it, then prints every
	 * {@code href="..."} attribute found in it followed by the address it holds.
	 *
	 * @param args command-line arguments (unused)
	 */
	public static void main(String[] args) {
		String destStr = getURLContent("http://www.163.com","gbk");
		System.out.println(destStr);
		// Alternative pattern that matches each whole anchor element:
		// Pattern p = Pattern.compile("<a[\\s\\S]*?</a>");
		// Capture only the address inside each href="..." attribute.
		Pattern p = Pattern.compile("href=\"(.+?)\"");
		Matcher m = p.matcher(destStr);

		while (m.find()) {
			System.out.println(m.group());   // the full href="..." attribute
			System.out.println(m.group(1));  // just the address
		}
	}

}
           

將上一個代碼進行優化

package Demo;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Fetches the source text of a web page and extracts substrings from it by regular expression.
 *
 * @author hejun
 */
public class WebSpiderTest {

	/** Charset used when the caller passes none; this was the previously hard-coded value. */
	private static final String DEFAULT_CHARSET = "gbk";

	/**
	 * Downloads the content behind {@code urlStr} and decodes it with {@code charset}.
	 * <p>
	 * Lines are concatenated without their line terminators. I/O failures are
	 * reported via a stack trace, and whatever was read before the failure
	 * (possibly the empty string) is returned.
	 *
	 * @param urlStr  the URL to read from
	 * @param charset name of the charset used to decode the response; when
	 *                {@code null} or blank, {@code "gbk"} is used
	 * @return the decoded content, never {@code null}
	 * @throws IllegalArgumentException if {@code charset} names an illegal or
	 *                                  unsupported charset
	 */
	public static String getURLContent(String urlStr, String charset) {
		String charsetName = (charset == null || charset.trim().isEmpty()) ? DEFAULT_CHARSET : charset;
		// Resolve the charset before opening the connection so that a bad name
		// cannot leave an open stream behind.
		Charset decoder = Charset.forName(charsetName);
		StringBuilder sb = new StringBuilder();
		try {
			URL url = new URL(urlStr);
			// try-with-resources closes the reader even when reading fails.
			try (BufferedReader reader = new BufferedReader(new InputStreamReader(url.openStream(), decoder))) {
				String line;
				while ((line = reader.readLine()) != null) {
					sb.append(line);
				}
			}
		} catch (MalformedURLException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		}
		return sb.toString();
	}

	/**
	 * Collects one substring per match of {@code regexStr} in {@code destStr}.
	 * <p>
	 * When the pattern declares at least one capturing group, the text of
	 * group 1 is collected (this may be {@code null} if the group did not take
	 * part in a match). When it declares none, the whole match is collected
	 * instead of failing with an {@link IndexOutOfBoundsException}.
	 *
	 * @param destStr  the text to search
	 * @param regexStr the regular expression to apply
	 * @return the collected substrings in match order; empty when nothing matches
	 */
	public static List<String> getMatherSubstrs(String destStr, String regexStr) {
		Matcher m = Pattern.compile(regexStr).matcher(destStr);
		// Group 0 is the entire match; use it only when no capturing group exists.
		int group = m.groupCount() >= 1 ? 1 : 0;
		List<String> result = new ArrayList<String>();
		while (m.find()) {
			result.add(m.group(group));
		}
		return result;
	}

	/**
	 * Downloads http://www.163.com as GBK text, prints it, then prints each
	 * href address consisting only of word characters, whitespace, '.', '/' and ':'.
	 *
	 * @param args command-line arguments (unused)
	 */
	public static void main(String[] args) {
		String destStr = getURLContent("http://www.163.com","gbk");
		System.out.println(destStr);
		// Alternative pattern that matches each whole anchor element:
		// Pattern p = Pattern.compile("<a[\\s\\S]*?</a>");
		List<String> result = getMatherSubstrs(destStr,"href=\"([\\w\\s./:]+?)\"");
		for (String t : result) {
			System.out.println(t);
		}
	}

}

           

喜歡我的可以關注我,我們可以一起交流學習

微信公衆号:

讓我愛上它Computer

qq群:473989408