在這個資料為王的時代,爬蟲應用得越來越廣泛。對於一個萌新程式員來說,如果你要做爬蟲,那麼Python是你的不二之選。但是對於那些老臘肉的Java程式員(亦或者你是程式媛)來說,想使用Java做爬蟲也不是不行,只是沒有Python那麼方便。身為一塊Java老臘肉的我,在此記錄一下自己使用Java做網絡爬蟲時用到的工具類。
在pom.xml檔案中引入commons-lang3 依賴:
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.6</version>
</dependency>
SpiderHttpUtils 工具類完整代碼如下:
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLEncoder;
import java.security.GeneralSecurityException;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;
import javax.net.ssl.HttpsURLConnection;
import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLSocketFactory;
import javax.net.ssl.TrustManager;
import javax.net.ssl.X509TrustManager;
import org.apache.commons.lang3.StringUtils;
/**
 * Small HTTP/HTTPS GET helpers for crawling, built on {@link HttpURLConnection}.
 *
 * <p>Both public fetch methods are best-effort: they never throw. Any failure
 * (malformed URL, I/O error, HTTP error status, unsupported charset, ...) is
 * logged at WARNING level and reported to the caller as an empty result.
 *
 * <p><strong>Security note:</strong> when {@code isHttps} is {@code true} the
 * connection accepts <em>every</em> server certificate. That is convenient for
 * crawling hosts with broken certificate chains but offers no protection
 * against man-in-the-middle attacks; never send credentials through it.
 */
public class SpiderHttpUtils {

    private static final Logger LOG = Logger.getLogger(SpiderHttpUtils.class.getName());

    /** Charset used for URL-encoding and as the default response charset. */
    private static final String DEFAULT_CHARSET = "UTF-8";

    /** Connect timeout for {@link #sendGet}, in milliseconds. */
    private static final int PAGE_CONNECT_TIMEOUT_MS = 10000;

    /** Connect timeout for {@link #getFileAsByte}, in milliseconds. */
    private static final int FILE_CONNECT_TIMEOUT_MS = 3000;

    /** Maximum time a single read may block, in milliseconds. */
    private static final int READ_TIMEOUT_MS = 30000;

    /** User-Agent sent by {@link #sendGet}. */
    private static final String PAGE_USER_AGENT =
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36";

    /** User-Agent sent by {@link #getFileAsByte}. */
    private static final String FILE_USER_AGENT = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)";

    /**
     * Performs a GET request and returns the response body as text.
     *
     * <p>Line terminators of the response are dropped: the body is returned as
     * the concatenation of its lines.
     *
     * @param isHttps    {@code true} to open the URL as HTTPS with certificate
     *                   validation disabled; {@code false} for a regular connection
     * @param requestUrl target URL; it may already contain a query string
     * @param params     query parameters to append (URL-encoded as UTF-8); may be
     *                   {@code null} or empty
     * @param headers    extra request headers, applied after (and overriding) the
     *                   default Accept/Connection/User-Agent headers; may be {@code null}
     * @param charSet    charset used to decode the response; blank means UTF-8
     * @return the response body, or {@code ""} if {@code requestUrl} is blank or
     *         the request fails for any reason
     */
    public static String sendGet(boolean isHttps, String requestUrl, Map<String, String> params,
            Map<String, String> headers, String charSet) {
        if (isBlank(requestUrl)) {
            return "";
        }
        String encoding = isBlank(charSet) ? DEFAULT_CHARSET : charSet;
        try {
            URL url = new URL(appendQuery(requestUrl, requestParamsBuild(params)));
            HttpURLConnection conn = openConnection(isHttps, url, PAGE_USER_AGENT, PAGE_CONNECT_TIMEOUT_MS);
            if (headers != null) {
                for (Map.Entry<String, String> header : headers.entrySet()) {
                    conn.setRequestProperty(header.getKey(), header.getValue());
                }
            }
            conn.connect();
            // The raw stream is its own resource so it is still closed when the
            // reader cannot be created (e.g. unsupported charset name).
            try (InputStream in = conn.getInputStream();
                    BufferedReader reader = new BufferedReader(new InputStreamReader(in, encoding))) {
                StringBuilder body = new StringBuilder();
                String line;
                while ((line = reader.readLine()) != null) {
                    body.append(line);
                }
                return body.toString();
            }
        } catch (Exception e) {
            // Boundary of a best-effort API: report the failure, never propagate it.
            LOG.log(Level.WARNING, "GET request failed: " + requestUrl, e);
            return "";
        }
    }

    /**
     * Builds a URL query string ({@code k1=v1&k2=v2}) from the given parameters.
     *
     * <p>Keys and values are URL-encoded as UTF-8. A {@code null} value is written
     * as an empty value ({@code key=}); entries with a {@code null} key are skipped.
     * Entries appear in the map's iteration order.
     *
     * @param map parameters to encode; may be {@code null} or empty
     * @return the query string without a leading {@code ?}, or {@code ""} when
     *         there is nothing to encode
     */
    public static String requestParamsBuild(Map<String, String> map) {
        if (map == null || map.isEmpty()) {
            return "";
        }
        StringBuilder query = new StringBuilder();
        for (Map.Entry<String, String> entry : map.entrySet()) {
            if (entry.getKey() == null) {
                continue;
            }
            if (query.length() > 0) {
                query.append('&');
            }
            query.append(urlEncode(entry.getKey())).append('=').append(urlEncode(entry.getValue()));
        }
        return query.toString();
    }

    /**
     * Downloads the resource at the given URL with a GET request.
     *
     * @param isHttps    {@code true} to open the URL as HTTPS with certificate
     *                   validation disabled; {@code false} for a regular connection
     * @param requestUrl URL of the resource to download
     * @return the response bytes, or an empty array if {@code requestUrl} is blank
     *         or the download fails for any reason
     */
    public static byte[] getFileAsByte(boolean isHttps, String requestUrl) {
        if (isBlank(requestUrl)) {
            return new byte[0];
        }
        try {
            HttpURLConnection conn =
                    openConnection(isHttps, new URL(requestUrl), FILE_USER_AGENT, FILE_CONNECT_TIMEOUT_MS);
            // No setDoOutput(true): this is a body-less GET, and enabling output
            // can make some HttpURLConnection implementations send a POST instead.
            conn.connect();
            try (InputStream in = conn.getInputStream()) {
                ByteArrayOutputStream collected = new ByteArrayOutputStream();
                byte[] chunk = new byte[8192];
                int read;
                while ((read = in.read(chunk)) != -1) {
                    collected.write(chunk, 0, read);
                }
                return collected.toByteArray();
            }
        } catch (Exception e) {
            // Boundary of a best-effort API: report the failure, never propagate it.
            LOG.log(Level.WARNING, "Download failed: " + requestUrl, e);
            return new byte[0];
        }
    }

    /** Opens (without connecting) an HTTP(S) connection with the shared default headers and timeouts. */
    private static HttpURLConnection openConnection(boolean isHttps, URL url, String userAgent,
            int connectTimeoutMs) throws IOException, GeneralSecurityException {
        HttpURLConnection conn;
        if (isHttps) {
            conn = getHttpsUrlConnection(url);
        } else {
            URLConnection raw = url.openConnection();
            if (!(raw instanceof HttpURLConnection)) {
                throw new IOException("Not an HTTP(S) URL: " + url);
            }
            conn = (HttpURLConnection) raw;
        }
        conn.setRequestProperty("Accept", "*/*");
        conn.setRequestProperty("Connection", "keep-alive");
        conn.setRequestProperty("User-Agent", userAgent);
        // Without timeouts a stalled server would block the calling thread forever.
        conn.setConnectTimeout(connectTimeoutMs);
        conn.setReadTimeout(READ_TIMEOUT_MS);
        return conn;
    }

    /**
     * Opens an HTTPS connection whose socket factory trusts every certificate.
     *
     * <p>NOTE(security): certificate validation is intentionally disabled here;
     * see the class-level security note before reusing this elsewhere.
     */
    private static HttpsURLConnection getHttpsUrlConnection(URL url)
            throws IOException, GeneralSecurityException {
        URLConnection raw = url.openConnection();
        if (!(raw instanceof HttpsURLConnection)) {
            throw new IOException("Not an HTTPS URL: " + url);
        }
        HttpsURLConnection httpsConn = (HttpsURLConnection) raw;
        TrustManager[] trustAll = { new X509TrustManager() {
            @Override
            public void checkClientTrusted(X509Certificate[] chain, String authType)
                    throws CertificateException {
                // Accept any client certificate.
            }

            @Override
            public void checkServerTrusted(X509Certificate[] chain, String authType)
                    throws CertificateException {
                // Accept any server certificate.
            }

            @Override
            public X509Certificate[] getAcceptedIssuers() {
                // The interface contract requires a non-null (possibly empty) array.
                return new X509Certificate[0];
            }
        } };
        // Standard algorithm name without a provider, so this works on any JVM.
        SSLContext sslContext = SSLContext.getInstance("TLS");
        sslContext.init(null, trustAll, new java.security.SecureRandom());
        SSLSocketFactory socketFactory = sslContext.getSocketFactory();
        httpsConn.setSSLSocketFactory(socketFactory);
        return httpsConn;
    }

    /** Joins a base URL and an already-encoded query string with the correct separator. */
    private static String appendQuery(String requestUrl, String query) {
        if (query.isEmpty()) {
            return requestUrl;
        }
        if (requestUrl.indexOf('?') < 0) {
            return requestUrl + '?' + query;
        }
        char last = requestUrl.charAt(requestUrl.length() - 1);
        if (last == '?' || last == '&') {
            return requestUrl + query;
        }
        return requestUrl + '&' + query;
    }

    /** URL-encodes text as UTF-8, treating {@code null} as the empty string. */
    private static String urlEncode(String text) {
        if (text == null) {
            return "";
        }
        try {
            return URLEncoder.encode(text, DEFAULT_CHARSET);
        } catch (UnsupportedEncodingException e) {
            // Cannot happen: every Java platform is required to support UTF-8.
            throw new IllegalStateException("UTF-8 encoding is unavailable", e);
        }
    }

    /** Returns {@code true} if the text is {@code null}, empty, or whitespace only. */
    private static boolean isBlank(String text) {
        if (text == null) {
            return true;
        }
        for (int i = 0; i < text.length(); i++) {
            if (!Character.isWhitespace(text.charAt(i))) {
                return false;
            }
        }
        return true;
    }
}