
A Java crawler that scrapes the Baidu Fengyunbang (百度风云榜) Top 10

Recently, in a project, I ran into the problem of hooking a Java program up to a Python crawler through program calls and interfaces, and at first it took quite a bit of debugging before I got the hang of it.

Along the way I also discovered how much fun crawling can be, so I decided to write a small crawler in Java myself. It is only a modest demo, but I still want to record it here so I can look it up later.

Straight to the code:

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.util.StringUtils;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

public class MySpider {
    public static void main(String[] args) {
        List<NewsEntity> list = new ArrayList<NewsEntity>();
        Connection connect = Jsoup.connect("http://top.baidu.com/buzz?b=1&fr=tph_right");  // Baidu Fengyunbang URL
        connect.userAgent("Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)");  // send a desktop browser User-Agent so the request looks like an ordinary visit
        try {
            Document document = connect.get();      // open the connection and fetch the page as a Document
            Element main = document.getElementById("main");  // root element of the section we want to scrape
            Elements url = main.select("div[class=mainBody]").select("table[class=list-table]")
                    .select("tbody").select("tr"); // CSS selectors drilling down to the ranking rows
            int i = 0;
            for (Element element : url) {
                NewsEntity entity = new NewsEntity();
                String attr_url = element.select("td[class=keyword]").select("a[class=list-title]").attr("href");
                String text = element.select("td[class=keyword]").select("a[class=list-title]").text();
                String span = element.select("td[class=last]").select("span").text();  // search-volume column
                if (StringUtils.isEmpty(attr_url) || StringUtils.isEmpty(text) || StringUtils.isEmpty(span)) {
                    continue;  // skip header rows and anything missing a keyword, link, or volume
                }
                entity.setTitle(text);
                entity.setUrl(attr_url);
                entity.setHots(span);
                i++;
                if (i > 10) {   // keep only the first 10 valid rows
                    break;
                }
                list.add(entity);
            }
            System.out.println(list.toString());
            System.out.println(list.size());

        } catch (IOException e) {
            e.printStackTrace();
            System.out.println("The page structure has changed or access was blocked");
        }
    }
}
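
Since the target page or the network can misbehave at any time, it can also help to set a timeout and check the HTTP status before parsing. Below is a minimal sketch of that connection setup, assuming the same jsoup API used above; the 10-second timeout and the class name SpiderConnectionDemo are illustrative choices, not part of the original demo:

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.io.IOException;

public class SpiderConnectionDemo {
    public static void main(String[] args) throws IOException {
        // Same target page as above; the timeout and status check are illustrative additions
        Connection connect = Jsoup.connect("http://top.baidu.com/buzz?b=1&fr=tph_right")
                .userAgent("Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)")
                .timeout(10_000);                           // fail fast instead of hanging forever
        Connection.Response response = connect.execute();  // execute() exposes the HTTP status code
        System.out.println("HTTP status: " + response.statusCode());
        Document document = response.parse();               // parse the response body into a Document
        System.out.println("Page title: " + document.title());
    }
}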

And a simple entity class to wrap each result:

/**
 * @author RYH
 * @description News entity that wraps one ranking entry
 * @date 2019/2/26
 **/
public class NewsEntity {
    private String title;
    private String url;
    private String hots;

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public String getHots() {
        return hots;
    }

    public void setHots(String hots) {
        this.hots = hots;
    }

    @Override
    public String toString() {
        return "NewsEntity{" +
                "title='" + title + '\'' +
                ", url='" + url + '\'' +
                ", hots=" + hots +
                '}';
    }
}

The only new dependency to pull in is jsoup, which turns out to be quite powerful (the StringUtils call above comes from Spring and could just as well be replaced with a plain null/empty check):

        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.11.3</version>
        </dependency>
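
As a small illustration of how flexible jsoup is, the same CSS selector syntax also works on an HTML string parsed in memory, with no network call at all. A minimal sketch (the HTML fragment and the class name JsoupSelectorDemo are made up for the example):

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class JsoupSelectorDemo {
    public static void main(String[] args) {
        // A hand-written HTML fragment, just to demonstrate the selector syntax
        String html = "<table class=\"list-table\">"
                + "<tr><td class=\"keyword\"><a class=\"list-title\" href=\"#\">demo title</a></td>"
                + "<td class=\"last\"><span>12345</span></td></tr></table>";
        Document doc = Jsoup.parse(html);                        // parse from a String, no network needed
        for (Element row : doc.select("table.list-table tr")) {  // shorthand class selectors also work
            System.out.println(row.select("a.list-title").text()
                    + " -> " + row.select("td.last span").text());
        }
    }
}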

The console output is clear at a glance, and simple scrapes like this are really quite easy to do.
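
Based on the toString() defined above, each element of the printed list has roughly this shape (the values here are placeholders, not real scrape results):

NewsEntity{title='<keyword>', url='<link>', hots=<search volume>}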