Java 網絡爬蟲爬取網站代碼
package Demo;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
/**
 * Minimal web-spider demo: downloads a single page and echoes its source to
 * standard output.
 */
public class WebSpiderTest {
    /**
     * Opens {@code http://www.163.com}, reads the response line by line and
     * prints every line to {@code System.out}.
     *
     * <p>The stream is decoded with the platform default charset because no
     * charset is passed to {@link InputStreamReader}. Failures are reported
     * via {@code printStackTrace()} and do not propagate.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        try {
            URL url = new URL("http://www.163.com");
            // try-with-resources closes the reader (and the underlying
            // connection stream) even when readLine() fails mid-way.
            try (BufferedReader reader =
                    new BufferedReader(new InputStreamReader(url.openStream()))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    System.out.println(line);
                }
            }
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
將上一個代碼進行封裝
package Demo;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Fetches the source text of the web page that a URL points to.
 *
 * @author hejun
 */
public class WebSpiderTest {
    /** Charset used when the caller passes {@code null} or a blank name. */
    private static final String DEFAULT_CHARSET = "gbk";

    /**
     * Downloads the resource at {@code urlStr} and returns its text.
     *
     * <p>Lines are concatenated without line separators. If the URL is
     * malformed or an I/O error occurs, the stack trace is printed and
     * whatever was read up to that point (possibly the empty string) is
     * returned.
     *
     * @param urlStr  the URL to read
     * @param charset name of the charset used to decode the response; a
     *                {@code null} or blank value falls back to {@code "gbk"}
     * @return the decoded content with line terminators removed
     * @throws IllegalArgumentException if {@code charset} is not a legal or
     *                                  supported charset name
     */
    public static String getURLContent(String urlStr, String charset) {
        String charsetName =
                (charset == null || charset.trim().isEmpty()) ? DEFAULT_CHARSET : charset;
        // Honour the caller's charset instead of a hard-coded one.
        Charset decoder = Charset.forName(charsetName);
        StringBuilder sb = new StringBuilder();
        try {
            URL url = new URL(urlStr);
            // try-with-resources guarantees the stream is released.
            try (BufferedReader reader =
                    new BufferedReader(new InputStreamReader(url.openStream(), decoder))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    sb.append(line);
                }
            }
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return sb.toString();
    }

    /**
     * Demo entry point: prints the source of {@code http://www.163.com},
     * decoded as GBK.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        String destStr = getURLContent("http://www.163.com", "gbk");
        System.out.println(destStr);
    }
}
擷取網頁的超連結,及網頁亂碼處理
package Demo;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Fetches the source text of the web page that a URL points to and extracts
 * the hyperlink targets found in it.
 *
 * @author hejun
 */
public class WebSpiderTest {
    /** Charset used when the caller passes {@code null} or a blank name. */
    private static final String DEFAULT_CHARSET = "gbk";

    /**
     * Downloads the resource at {@code urlStr} and returns its text.
     *
     * <p>Lines are concatenated without line separators. If the URL is
     * malformed or an I/O error occurs, the stack trace is printed and
     * whatever was read up to that point (possibly the empty string) is
     * returned.
     *
     * @param urlStr  the URL to read
     * @param charset name of the charset used to decode the response; a
     *                {@code null} or blank value falls back to {@code "gbk"}
     * @return the decoded content with line terminators removed
     * @throws IllegalArgumentException if {@code charset} is not a legal or
     *                                  supported charset name
     */
    public static String getURLContent(String urlStr, String charset) {
        String charsetName =
                (charset == null || charset.trim().isEmpty()) ? DEFAULT_CHARSET : charset;
        // Honour the caller's charset instead of a hard-coded one.
        Charset decoder = Charset.forName(charsetName);
        StringBuilder sb = new StringBuilder();
        try {
            URL url = new URL(urlStr);
            // try-with-resources guarantees the stream is released.
            try (BufferedReader reader =
                    new BufferedReader(new InputStreamReader(url.openStream(), decoder))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    sb.append(line);
                }
            }
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return sb.toString();
    }

    /**
     * Demo entry point: prints the source of {@code http://www.163.com},
     * then, for every {@code href="..."} attribute found, prints the whole
     * match followed by the captured link target.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        String destStr = getURLContent("http://www.163.com", "gbk");
        System.out.println(destStr);
        // Alternative that matches the entire anchor element:
        //   Pattern.compile("<a[\\s\\S]*?</a>")
        // The pattern below captures only the link address (group 1).
        Pattern p = Pattern.compile("href=\"(.+?)\"");
        Matcher m = p.matcher(destStr);
        while (m.find()) {
            System.out.println(m.group());
            System.out.println(m.group(1));
        }
    }
}
將上一個代碼進行優化
package Demo;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Fetches the source text of the web page that a URL points to and extracts
 * substrings from it with a regular expression.
 *
 * @author hejun
 */
public class WebSpiderTest {
    /** Charset used when the caller passes {@code null} or a blank name. */
    private static final String DEFAULT_CHARSET = "gbk";

    /**
     * Downloads the resource at {@code urlStr} and returns its text.
     *
     * <p>Lines are concatenated without line separators. If the URL is
     * malformed or an I/O error occurs, the stack trace is printed and
     * whatever was read up to that point (possibly the empty string) is
     * returned.
     *
     * @param urlStr  the URL to read
     * @param charset name of the charset used to decode the response; a
     *                {@code null} or blank value falls back to {@code "gbk"}
     * @return the decoded content with line terminators removed
     * @throws IllegalArgumentException if {@code charset} is not a legal or
     *                                  supported charset name
     */
    public static String getURLContent(String urlStr, String charset) {
        String charsetName =
                (charset == null || charset.trim().isEmpty()) ? DEFAULT_CHARSET : charset;
        // Honour the caller's charset instead of a hard-coded one.
        Charset decoder = Charset.forName(charsetName);
        StringBuilder sb = new StringBuilder();
        try {
            URL url = new URL(urlStr);
            // try-with-resources guarantees the stream is released.
            try (BufferedReader reader =
                    new BufferedReader(new InputStreamReader(url.openStream(), decoder))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    sb.append(line);
                }
            }
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return sb.toString();
    }

    /**
     * Collects one substring for every match of {@code regexStr} in
     * {@code destStr}, in order of occurrence.
     *
     * <p>When the pattern declares at least one capturing group, the text of
     * group 1 is collected (this may be {@code null} if the group did not
     * take part in a match). When the pattern has no capturing group, the
     * whole match is collected instead of failing.
     *
     * @param destStr  the text to search
     * @param regexStr the regular expression to apply
     * @return the collected substrings; empty when nothing matches
     */
    public static List<String> getMatherSubstrs(String destStr, String regexStr) {
        Pattern p = Pattern.compile(regexStr);
        Matcher m = p.matcher(destStr);
        // group(1) would throw IndexOutOfBoundsException on a group-less
        // pattern, so fall back to the entire match (group 0) in that case.
        int group = m.groupCount() >= 1 ? 1 : 0;
        List<String> result = new ArrayList<String>();
        while (m.find()) {
            result.add(m.group(group));
        }
        return result;
    }

    /**
     * Demo entry point: prints the source of {@code http://www.163.com},
     * followed by every link address extracted from its {@code href}
     * attributes.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        String destStr = getURLContent("http://www.163.com", "gbk");
        System.out.println(destStr);
        // Alternative that matches the entire anchor element:
        //   "<a[\\s\\S]*?</a>"
        List<String> result = getMatherSubstrs(destStr, "href=\"([\\w\\s./:]+?)\"");
        for (String t : result) {
            System.out.println(t);
        }
    }
}
喜歡我的可以關注我,我們可以一起交流學習
微信公眾號:
讓我愛上它Computer
qq群:473989408