天天看點

使用HTTPURLConnection模拟登陸,爬取網頁内容

如果你需要爬取某些網頁的内容,但這些網站需要登入,那就需要一些額外的步驟,由程式先完成登入,再爬取我們需要的網頁内容。任何登入頁面本質上都是向伺服器發送請求,如果我們能夠模拟向伺服器發送同樣的請求,那麼登入自然也就不在話下。通過Fiddler抓取我們需要的一些資訊,就能很輕鬆地模拟出向伺服器發送的請求。下面我們使用HttpURLConnection進行模拟登陸,并爬取我們需要的網頁内容。

import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.List;
import java.util.Map.Entry;
 
/**
 * Demonstrates a simulated form login with {@link HttpURLConnection}: POSTs the
 * credentials to the login URL, then prints the response body followed by the
 * response headers.
 */
public class INotesPost {

    /**
     * Charset used to encode the form body and to decode the response.
     * NOTE(review): assumes the target site serves UTF-8; change this to the
     * charset the site actually declares if it differs.
     */
    private static final String CHARSET = "UTF-8";

    /**
     * Sends the login request and dumps the response to standard output.
     *
     * @param args unused
     * @throws Exception if the URL is malformed or any network I/O fails
     */
    public static void main(String[] args) throws Exception {
        String surl = "***?login";
        URL url = new URL(surl);
        HttpURLConnection connection = (HttpURLConnection) url.openConnection();

        try {
            connection.setDoOutput(true);
            connection.setDoInput(true);
            connection.setRequestMethod("POST");
            connection.setUseCaches(false);
            connection.setRequestProperty("Content-Type", "application/x-www-form-urlencoded");
            connection.setRequestProperty("User-Agent","Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.1.4322; .NET4.0C; .NET4.0E)");
            connection.setRequestProperty("Accept-Language","zh-CN");
            // Deliberately no "Accept-Encoding: gzip, deflate": HttpURLConnection does not
            // decompress the body for us, so advertising compression would make the text
            // read below come out as raw compressed bytes.

            // Opening the output stream implicitly connects; closing the writer flushes it.
            try (OutputStreamWriter out = new OutputStreamWriter(
                    connection.getOutputStream(), CHARSET)) {
                // The form field names and values can be captured with Fiddler.
                out.write("username=***&password=***");
            }

            StringBuilder retStr = new StringBuilder();
            // Decode with an explicit charset instead of the platform default.
            try (InputStream in = connection.getInputStream();
                 BufferedReader br = new BufferedReader(new InputStreamReader(in, CHARSET))) {
                String line;
                while ((line = br.readLine()) != null) {
                    // Lines are concatenated without separators, as before.
                    retStr.append(line);
                }
            }

            System.out.println(retStr);
            // The status line is reported under a null key.
            for (Entry<String, List<String>> header : connection.getHeaderFields().entrySet()) {
                System.out.println(header.getKey() + " " + header.getValue());
            }
        } finally {
            // Release the underlying socket even when the request fails.
            connection.disconnect();
        }
    }
}

在模拟登陸的時候,我們其實可以通過Fiddler來抓取網頁送出參數,直接将Cookie寫到我們的Connection的RequestProperty中去。

Fiddler抓取登入參數

使用HTTPURLConnection模拟登陸,爬取網頁内容

将抓取到的參數直接填充到Connection的RequestProperty屬性中去,就能輕鬆抓取網頁内容。如果我們抓取的頁面内容是中文的,要注意頁面的charset(字元集),并在讀取頁面傳回的位元組流時指定對應的字元集進行解碼:

// Decode the response bytes explicitly as UTF-8, then buffer for line-by-line reads.
InputStreamReader utf8Reader = new InputStreamReader(urlStream, "utf-8");
BufferedReader bufferedReader = new BufferedReader(utf8Reader);

下面是一段相對完整的代碼

String s = "****";  
  
        url = new URL(s);  
        HttpURLConnection resumeConnection = (HttpURLConnection) url.openConnection();  
  
                  
        resumeConnection.setRequestProperty("Accept-Charset","utf-8");  
        resumeConnection.setRequestProperty("Content-Type","text/html;utf-8");  
        resumeConnection.setRequestProperty("Cookie","AttachmentAuth=77u/PD94bWwgdmVyc2lvbj0iMS4wIiBlbmNvZGluZz0idXRmLTgiPz48U1A+MCMuZnxpaWNwfDAwMDgyMzkwNSwwIy5mfGlpY3B8MDAwODIzOTA1LDEzMDc2NjgxMDQzODc3NDA0OCxUcnVlLEV0eHBYWVlYVHNYQ0hYR3hjRmZjdWowOXV6ekRXc01Hd0FLUzVkaFNmcEErcWo4S3pGTUYvYVRYZFJnWitSRW1pVmR4N0xKVzdoOUhzMitUamY5Z0E2VHY4a2hxeHNTQXlVRmhmQ1pwelBUOFBWQmc0NXI2cHo4eGZxZkEyNzAyOUo0eFBrcU9MM0dWNm1IVGdVNEZFT3E1OVIzSHA3dmZrS0tHR1YxNVJpTllKcXF1dUVCMmhlU1lGT0VLUjlBMitEQ00rMVlwdXBVTEJ0UGdWYk5lODBobEtydUttc1MyWWkrSmpXMFozTVVyRHJzN1VkU1VxNmdrYmo0dTB4OWNrTXRFZXJ1cUlZbDROb3N2UWhpSmNRTlVGcm9kNkVXaWhBL0tjUVpaZlY1UFJBREtjalZIYmx3dnRXMkIwZ1VPMVM3REJFa0VzOS9GQUViVzM2bnhJQT09LGh0dHA6Ly9vYS5zZGMuaWNiYzo4Mi9zdG9yYWdlL2F0dGFjaG1lbnQyLzIwMTUtMDUvZTY1Yjc3ZjUtNGZkMC00NDI2LWE1OWYtMjQxNTAxYWE0MjI1L+mZhOS7tjIu6L2v5Lu25byA5Y+R5Lit5b+D6K665paH5L2T5L6L6KaB5rGCLmRvYzwvU1A+");  
        resumeConnection.setRequestProperty("Cookie","PortalAuth=77u/PD94bWwgdmVyc2lvbj0iMS4wIiBlbmNvZGluZz0idXRmLTgiPz48U1A+MCMuZnxpaWNwfDAwMDgyMzkwNSwwIy5mfGlpY3B8MDAwODIzOTA1LDEzMDc2NzA1NzM3NTI3MDY4NCxUcnVlLFFldU1Fa2xDelI0bEZaTTJkbVVtZGxPVmhsUVdwQWMzQlk2TCtWdlVOb1ZsRjVHZ1BMRVhMTTAwcHBKWW5WTGZLYzFPTTh2aGRydmRIVWVLR3JOb255dWpTS2lMeEhyQUlBbmtYZTVBTWlFVGpFMlF4bzRjWVRKeEhjNU5ScEhMSWJOWHdWckFTWHhuNUd5bURST0xTK2d3cUFWbThFUllPM3J1enR4aGgwT1VrTDJGMGkrUDdWcHViRm84blFrTXp4MFNyMXdtQzE3UEJkcGpGVU1nOW8xRkJoeHhzWElDdHhLVEpVSHRGMmpDNmNKS285bGJtTXZJZnlwR0k1VGpLd29TTUpaenhyb1BkQ3VOVW13Wk01T0ZEUExSK1lqajVCRitJSFc1enV0UlpXM08wWHhNaldIWk1nWHhncjF0dUc1b3E3RlRwOGhCMFVCWjAydDlGQT09LGh0dHA6Ly9vYS5zZGMuaWNiYy88L1NQPg==");  
        resumeConnection.connect();  
  
        InputStream urlStream = resumeConnection.getInputStream();  
  
        BufferedReader bufferedReader = new BufferedReader(  
  
        new InputStreamReader(urlStream,"utf-8"));  
  
        String ss = null;  
        StringBuilder total = new StringBuilder();  
        while ((ss = bufferedReader.readLine()) != null) {  
            total.append(ss);  
        }  
        bufferedReader.close();       
        resumeConnection.disconnect();  
          
//      System.out.println(total.toString());      

轉載:http://blog.csdn.net/kangkanglou/article/details/45895407