天天看点

[工具代码]使用Java爬取b站弹幕文件

    • 简介
    • 正文
    • 后记

简介

跳过废话,直接看正文

这是我在很久以前用Java编写的爬取b站弹幕文件的工具类,仅供参考。(注:截至2016年底,凡是形如 www.bilibili.com/video/av号/ 的网址都能正常工作)

正文

  • Constants
/**
 * String templates used when downloading bilibili danmu (comment) files.
 *
 * <p>Each template contains an upper-case placeholder that callers substitute
 * with {@link String#replace(CharSequence, CharSequence)}.
 */
public class Constants {
    /** Remote danmu XML URL template; {@code CID} is replaced with the video's cid. */
    public static final String BILIBILI_DANMU_URL_PATH_PATTERN = "http://comment.bilibili.tv/CID.xml";

    /** Local output path template; {@code VIDEO_NAME} is replaced with the sanitized video title. */
    public static final String BILIBILI_DANMU_FILE_PATH_PATTERN = "danmures/bilibili/VIDEO_NAME.xml";
}
           
  • getBilibiliDanmuFileByUrl
import java.io.IOException;
import java.io.InputStream;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.params.CoreConnectionPNames;

public class ClawUtil {
    public static String getBilibiliDanmuFileByUrl(final String url) {
        HttpClient sHttpClient = new DefaultHttpClient();
        String fileName = null;
        final HttpGet httpGet = new HttpGet(url);
        try {
            // send request
            httpGet.getParams().setParameter(CoreConnectionPNames.SO_TIMEOUT, );
            final HttpResponse response = sHttpClient.execute(httpGet);
            final HttpEntity entity = response.getEntity();

            // unzip to get htmlString
            String htmlString = StringUtil.gzipInputStreamToUTF8String(entity.getContent());

            // get cid from htmlString
            String cid = "";
            {
                int startIndex = htmlString.indexOf("cid=") + ;
                int endIndex = htmlString.indexOf("&", startIndex);
                cid = htmlString.substring(startIndex, endIndex);
            }

            //get video name from htmlString
            String videoName = "";
            {
                int startIndex = htmlString.indexOf("<title>") + ;
                int endIndex = htmlString.indexOf("</title>", startIndex);
                videoName = StringUtil.removeSpecialChar(htmlString.substring(startIndex, endIndex));
            }

            fileName = Constants.BILIBILI_DANMU_FILE_PATH_PATTERN.replace("VIDEO_NAME", videoName);
            if (!FileUtil.isFileExist(fileName)) {
                String danMuUrl = Constants.BILIBILI_DANMU_URL_PATH_PATTERN.replace("CID", cid);
                HttpUtil.writeRequestEntityIntoFile(Constants.BILIBILI_DANMU_URL_PATH_PATTERN.replace("CID", cid), fileName);
            }
        } catch (final IOException e) {
            e.printStackTrace();
            return null;
        } finally {
            httpGet.abort();
        }
        return fileName;
    }
}
           
  • gzipInputStreamToUTF8String
/**
 * Decompresses a gzip stream and decodes it as UTF-8 text.
 *
 * <p>The text is read line by line and every line is terminated with
 * {@code "\r\n"} in the result, regardless of the original line separator.
 * The supplied stream is always closed before this method returns, including
 * when decompression fails.
 *
 * @param is gzip-compressed input; may be {@code null}
 * @return the decoded text; an empty string if {@code is} is {@code null} or is
 *         not valid gzip data; the text read so far if an I/O error occurs midway
 */
public static String gzipInputStreamToUTF8String(InputStream is) {
    StringBuilder sb = new StringBuilder();
    if (is == null) {
        return sb.toString();
    }

    BufferedReader br = null;
    try {
        br = new BufferedReader(new InputStreamReader(new GZIPInputStream(is), "utf-8"));

        String line;
        while ((line = br.readLine()) != null) {
            sb.append(line);
            sb.append("\r\n");
        }
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        try {
            if (br != null) {
                // Closing the reader closes the whole chain down to the caller's stream.
                br.close();
            } else {
                // The gzip header could not be read, so no wrapper owns the stream yet.
                is.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    return sb.toString();
}
           
  • removeSpecialChar
/**
 * Character class of ASCII and full-width punctuation (plus the space) that is
 * stripped from names. Compiled once because the pattern never changes.
 *
 * NOTE(review): the published listing had the text "[email protected]" where
 * "~!@" is restored here. That text parsed as a nested character class and made
 * the method delete the letters e, m, a, i, l, p, r, o, t, c and d as well.
 */
private static final Pattern SPECIAL_CHARS = Pattern.compile(
      "[ `~!@#$%^&*()+=|{}':;',\\[\\].<>/?~!@#¥%……&*()——+|{}【】‘;:”“’。,、?]");

/**
 * Removes every character matched by {@link #SPECIAL_CHARS} from {@code str}
 * and trims the result.
 *
 * @param str text to clean; must not be {@code null}
 * @return {@code str} without the listed punctuation characters and spaces
 */
public static String removeSpecialChar(String str) {
      return SPECIAL_CHARS.matcher(str).replaceAll("").trim();
}
           
  • writeRequestEntityIntoFile
/**
 * Issues a GET request for {@code url} and stores the response body in
 * {@code absFileName}.
 *
 * <p>A fresh {@link DefaultHttpClient} is created for the call and its
 * connection manager is shut down afterwards so no connections are left open.
 *
 * @param url         address to download
 * @param absFileName path of the file to write
 * @return the result of the request-based overload: {@code true} if the body
 *         was written, {@code false} otherwise
 */
public static boolean writeRequestEntityIntoFile(final String url, final String absFileName) {
    final HttpClient httpClient = new DefaultHttpClient();
    try {
        return writeRequestEntityIntoFile(new HttpGet(url), httpClient, url, absFileName);
    } finally {
        httpClient.getConnectionManager().shutdown();
    }
}
           
  • writeRequestEntityIntoFile
/**
 * Executes {@code requestBase} on {@code httpClient} and writes the response
 * body to {@code absFileName}, decompressing it first when the response
 * declares a {@code gzip} or {@code deflate} Content-Encoding.
 *
 * <p>A body with no Content-Encoding (or {@code identity}) is written as-is.
 * The request is always aborted before returning.
 *
 * @param requestBase request to execute; its socket timeout is set to {@code WAIT_TIME_OUT}
 * @param httpClient  client used to execute the request
 * @param url         address being requested (not used by this method)
 * @param absFileName path of the file to write
 * @return the value returned by {@code FileUtil.readInputStreamAndWriteToFile}
 *         when the body was handed to it; {@code false} if the status is not
 *         200, the response has no entity, the encoding is unsupported, or an
 *         I/O error occurs
 */
private static boolean writeRequestEntityIntoFile(final HttpRequestBase requestBase,
        final HttpClient httpClient, final String url, final String absFileName) {
    try {
        requestBase.getParams().setParameter(CoreConnectionPNames.SO_TIMEOUT, WAIT_TIME_OUT);

        final HttpResponse response = httpClient.execute(requestBase);
        if (response.getStatusLine().getStatusCode() != HttpStatus.SC_OK) {
            return false;
        }

        final HttpEntity entity = response.getEntity();
        if (entity == null) {
            System.out.println("entity == null");
            return false;
        }

        System.out.println("contentType : " + entity.getContentType());
        final String encoding = entity.getContentEncoding() == null
                ? null
                : entity.getContentEncoding().getValue();

        if (encoding == null || "identity".equalsIgnoreCase(encoding)) {
            // Uncompressed body: store it unchanged instead of reporting failure.
            System.out.println("contentEncoding : none");
            return FileUtil.readInputStreamAndWriteToFile(absFileName, entity.getContent());
        }
        if ("gzip".equalsIgnoreCase(encoding)) {
            System.out.println("contentEncoding : gzip");
            return FileUtil.readInputStreamAndWriteToFile(absFileName, new GzipDecompressingEntity(entity).getContent());
        }
        if ("deflate".equalsIgnoreCase(encoding)) {
            System.out.println("contentEncoding : deflate");
            return FileUtil.readInputStreamAndWriteToFile(absFileName, new DeflateDecompressingEntity(entity).getContent());
        }

        // Any other encoding cannot be decoded here; say so rather than failing silently.
        System.out.println("unsupported contentEncoding : " + encoding);
    } catch (final IOException e) {
        // ClientProtocolException is an IOException, so this covers both original handlers.
        e.printStackTrace();
    } finally {
        requestBase.abort();
    }
    return false;
}
           

后记

篇幅所限,这里没有列出全部代码,但关键代码已经全部列出,其他的一些小方法自己去实现就可以了。