// HtmlUnit 提供了丰富的 API 来获取指定元素;jsoup 有的功能,HtmlUnit 也有。
package com.gcx.htmlunit;
import java.io.IOException;
import java.net.MalformedURLException;
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.DomElement;
import com.gargoylesoftware.htmlunit.html.DomNodeList;
import com.gargoylesoftware.htmlunit.html.HtmlDivision;
import com.gargoylesoftware.htmlunit.html.HtmlListItem;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
/**
 * Demonstrates three HtmlUnit element look-ups on a fetched page:
 * by element id, by tag name, and by XPath expression.
 */
public class Search {
    /**
     * Fetches http://www.bjsxt.com with a web client emulating Firefox 45 and prints,
     * as XML, the div with id "LRdiv0", every anchor element, and the first list item
     * matched by an XPath query. Checked fetch failures are reported via their stack trace.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {
        // Create the web client, emulating the specified browser version.
        WebClient wc = new WebClient(BrowserVersion.FIREFOX_45);
        try {
            // Fetch and parse the page.
            HtmlPage page = wc.getPage("http://www.bjsxt.com");

            // Look up the HTML DOM element with the given id.
            HtmlDivision hd = page.getHtmlElementById("LRdiv0");
            System.out.println(hd.asXml());
            System.out.println("-------------------------------");

            // Find every element with the given tag name.
            DomNodeList<DomElement> anchors = page.getElementsByTagName("a");
            for (DomElement anchor : anchors) {
                System.out.println(anchor.asXml());
            }
            System.out.println("-------------------------------");

            // Evaluate the XPath expression and take its first match.
            HtmlListItem item = (HtmlListItem) page.getByXPath("//div[@id='navMenu'][1]/ul/li").get(0);
            System.out.println(item.asXml());
        } catch (FailingHttpStatusCodeException e) {
            e.printStackTrace();
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Close the client and release its resources on every exit path;
            // previously this only ran when an IOException was caught.
            wc.close();
        }
    }
}