天天看點

Java網頁爬蟲--基于URLConnection的網頁爬蟲工具類

在這個資料為王的時代,爬蟲應用得越來越廣泛。對于一個萌新程式員來說,如果你要做爬蟲,那麼Python是你的不二之選。但是對于那些老臘肉的Java程式員(亦或者你是程式媛)來說,想使用Java做爬蟲也不是不行,隻是沒有Python那麼方便。身為一塊Java老臘肉的我,在此記錄一下自己在使用Java做網絡爬蟲時使用的工具類。

在pom.xml檔案中引入commons-lang3 依賴:

<dependency>
			<groupId>org.apache.commons</groupId>
			<artifactId>commons-lang3</artifactId>
			<version>3.6</version>
		</dependency>
           

 SpiderHttpUtils 工具類完整代碼如下: 

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLEncoder;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import java.util.Map;

import javax.net.ssl.HttpsURLConnection;
import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLSocketFactory;
import javax.net.ssl.TrustManager;
import javax.net.ssl.X509TrustManager;

import org.apache.commons.lang3.StringUtils;

/**
 * Static helpers for a simple web crawler built on {@link URLConnection}.
 *
 * <p>All public methods are best-effort: any failure (malformed URL, network
 * error, timeout, non-2xx response, unsupported charset) is reported by
 * returning an empty result instead of throwing.
 */
public class SpiderHttpUtils {

	/** Charset used to decode the response body when the caller supplies none. */
	private static final String DEFAULT_CHARSET = "UTF-8";

	/** Charset used to percent-encode query-string keys and values. */
	private static final String QUERY_CHARSET = "UTF-8";

	/** Connect timeout for {@link #sendGet}, in milliseconds. */
	private static final int TEXT_CONNECT_TIMEOUT_MILLIS = 10000;

	/** Connect timeout for {@link #getFileAsByte}, in milliseconds. */
	private static final int FILE_CONNECT_TIMEOUT_MILLIS = 3000;

	/** Maximum time to wait for the next chunk of response data, in milliseconds. */
	private static final int READ_TIMEOUT_MILLIS = 10000;

	/**
	 * Sends an HTTP(S) GET request and returns the response body as text.
	 *
	 * <p>The body is read line by line and the lines are concatenated
	 * <em>without</em> their line terminators, so the result is a single line.
	 *
	 * @param isHttps    {@code true} to open the URL through the trust-all HTTPS
	 *                   connection (the URL must then use the {@code https} scheme)
	 * @param requestUrl target URL; it may already contain a query string
	 * @param params     query parameters to append, may be {@code null} or empty
	 * @param headers    extra request headers that override the defaults, may be {@code null}
	 * @param charSet    charset used to decode the response; UTF-8 when blank
	 * @return the response body, or {@code ""} when {@code requestUrl} is blank or
	 *         the request fails for any reason
	 */
	public static String sendGet(boolean isHttps, String requestUrl, Map<String, String> params,
			Map<String, String> headers, String charSet) {
		if (isBlank(requestUrl)) {
			return "";
		}
		String responseCharset = isBlank(charSet) ? DEFAULT_CHARSET : charSet;

		try {
			// Open (but do not yet establish) the connection.
			URL url = new URL(appendQuery(requestUrl, requestParamsBuild(params)));
			URLConnection conn = openConnection(isHttps, url);

			// Default request headers: accept anything, keep the connection alive,
			// and present a desktop browser user agent.
			conn.setRequestProperty("Accept", "*/*");
			conn.setRequestProperty("Connection", "keep-alive");
			conn.setRequestProperty("User-Agent",
					"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36");

			// Caller-supplied headers are applied last so they win over the defaults.
			if (headers != null) {
				for (Map.Entry<String, String> header : headers.entrySet()) {
					conn.setRequestProperty(header.getKey(), header.getValue());
				}
			}

			// Without timeouts a stalled server would block the crawler forever.
			conn.setConnectTimeout(TEXT_CONNECT_TIMEOUT_MILLIS);
			conn.setReadTimeout(READ_TIMEOUT_MILLIS);

			conn.connect();

			try (BufferedReader reader = new BufferedReader(
					new InputStreamReader(conn.getInputStream(), responseCharset))) {
				StringBuilder body = new StringBuilder();
				String line;
				while ((line = reader.readLine()) != null) {
					body.append(line);
				}
				return body.toString();
			}
		} catch (Exception exception) {
			// Best-effort contract: every failure is reported as an empty body.
			return "";
		}
	}

	/**
	 * Builds a URL query string ({@code k1=v1&k2=v2}) from the given parameters.
	 *
	 * <p>Keys and values are both percent-encoded as UTF-8. A {@code null} value is
	 * emitted as an empty value ({@code key=}); an entry with a {@code null} key is
	 * skipped. Entries appear in the map's iteration order.
	 *
	 * @param map parameters to encode, may be {@code null} or empty
	 * @return the encoded query string without a leading {@code ?}, or {@code ""}
	 *         when there is nothing to encode
	 */
	public static String requestParamsBuild(Map<String, String> map) {
		if (map == null || map.isEmpty()) {
			return "";
		}
		StringBuilder query = new StringBuilder();
		for (Map.Entry<String, String> entry : map.entrySet()) {
			String key = entry.getKey();
			if (key == null) {
				continue;
			}
			if (query.length() > 0) {
				query.append('&');
			}
			String value = entry.getValue();
			query.append(encodeQueryComponent(key)).append('=')
					.append(value == null ? "" : encodeQueryComponent(value));
		}
		return query.toString();
	}

	/**
	 * Opens an HTTPS connection that accepts any server certificate.
	 *
	 * <p><strong>Security note:</strong> the installed trust manager performs no
	 * certificate validation at all, so the connection is open to
	 * man-in-the-middle attacks. This is kept because the crawler relies on it to
	 * reach sites with self-signed or otherwise invalid certificates; do not use
	 * it to transfer anything sensitive.
	 *
	 * @param url an {@code https} URL
	 * @return the unconnected connection with the trust-all socket factory installed
	 * @throws Exception if the URL is not HTTPS (the cast fails), the connection
	 *                   cannot be opened, or the TLS context cannot be created
	 */
	private static HttpsURLConnection getHttpsUrlConnection(URL url) throws Exception {
		HttpsURLConnection httpsConn = (HttpsURLConnection) url.openConnection();

		TrustManager[] trustAll = { new X509TrustManager() {
			@Override
			public void checkClientTrusted(X509Certificate[] chain, String authType) throws CertificateException {
				// Intentionally empty: every client certificate is accepted.
			}

			@Override
			public void checkServerTrusted(X509Certificate[] chain, String authType) throws CertificateException {
				// Intentionally empty: every server certificate is accepted.
			}

			@Override
			public X509Certificate[] getAcceptedIssuers() {
				// The X509TrustManager contract requires a non-null (possibly empty) array.
				return new X509Certificate[0];
			}
		} };

		// "TLS" is the standard protocol name and is available from every JSSE
		// provider, so no specific provider is pinned.
		SSLContext sslContext = SSLContext.getInstance("TLS");
		sslContext.init(null, trustAll, new java.security.SecureRandom());
		SSLSocketFactory socketFactory = sslContext.getSocketFactory();
		httpsConn.setSSLSocketFactory(socketFactory);
		return httpsConn;
	}

	/**
	 * Downloads the resource at the given URL with an HTTP(S) GET request.
	 *
	 * @param isHttps    {@code true} to open the URL through the trust-all HTTPS
	 *                   connection (the URL must then use the {@code https} scheme)
	 * @param requestUrl URL of the resource to download
	 * @return the raw response bytes, or an empty array when {@code requestUrl} is
	 *         blank or the download fails for any reason
	 */
	public static byte[] getFileAsByte(boolean isHttps, String requestUrl) {
		if (isBlank(requestUrl)) {
			return new byte[0];
		}

		try {
			URL url = new URL(requestUrl);
			URLConnection conn = openConnection(isHttps, url);

			// Default request headers.
			conn.setRequestProperty("accept", "*/*");
			conn.setRequestProperty("Connection", "keep-alive");
			conn.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;SV1)");

			conn.setConnectTimeout(FILE_CONNECT_TIMEOUT_MILLIS);
			conn.setReadTimeout(READ_TIMEOUT_MILLIS);

			// doOutput is deliberately left at its default (false): this is a plain
			// GET, and enabling output makes some runtimes send a POST instead.
			// doInput already defaults to true.

			conn.connect();

			try (BufferedInputStream in = new BufferedInputStream(conn.getInputStream())) {
				ByteArrayOutputStream collected = new ByteArrayOutputStream();
				byte[] buffer = new byte[2048];
				int read;
				while ((read = in.read(buffer)) != -1) {
					collected.write(buffer, 0, read);
				}
				return collected.toByteArray();
			}
		} catch (Exception exception) {
			// Best-effort contract: every failure is reported as an empty array.
			return new byte[0];
		}
	}

	/**
	 * Opens an unconnected connection for {@code url}. The non-HTTPS branch casts
	 * to {@link HttpURLConnection} so that non-HTTP schemes (file, ftp, ...) are
	 * rejected with an exception instead of being read.
	 */
	private static URLConnection openConnection(boolean isHttps, URL url) throws Exception {
		if (isHttps) {
			return getHttpsUrlConnection(url);
		}
		return (HttpURLConnection) url.openConnection();
	}

	/**
	 * Appends {@code query} to {@code baseUrl}, choosing {@code ?} or {@code &} as
	 * the separator depending on whether the URL already carries a query string.
	 * Returns {@code baseUrl} unchanged when {@code query} is empty.
	 */
	private static String appendQuery(String baseUrl, String query) {
		if (query.isEmpty()) {
			return baseUrl;
		}
		char last = baseUrl.charAt(baseUrl.length() - 1);
		if (last == '?' || last == '&') {
			return baseUrl + query;
		}
		String separator = baseUrl.indexOf('?') >= 0 ? "&" : "?";
		return baseUrl + separator + query;
	}

	/** Percent-encodes one query-string key or value as UTF-8. */
	private static String encodeQueryComponent(String text) {
		try {
			return URLEncoder.encode(text, QUERY_CHARSET);
		} catch (UnsupportedEncodingException e) {
			// Every Java platform is required to support UTF-8.
			throw new IllegalStateException("UTF-8 is not supported", e);
		}
	}

	/**
	 * Returns {@code true} when {@code text} is {@code null}, empty, or consists
	 * only of characters for which {@link Character#isWhitespace(char)} is true.
	 */
	private static boolean isBlank(String text) {
		if (text == null) {
			return true;
		}
		for (int i = 0; i < text.length(); i++) {
			if (!Character.isWhitespace(text.charAt(i))) {
				return false;
			}
		}
		return true;
	}

}
           

繼續閱讀