天天看點

htmlunit擷取指定元素

HtmlUnit 提供了豐富的 API 來擷取指定元素；jsoup 有的功能，HtmlUnit 也有。

package com.gcx.htmlunit;

import java.io.IOException;
import java.net.MalformedURLException;

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.DomElement;
import com.gargoylesoftware.htmlunit.html.DomNodeList;
import com.gargoylesoftware.htmlunit.html.HtmlDivision;
import com.gargoylesoftware.htmlunit.html.HtmlListItem;
import com.gargoylesoftware.htmlunit.html.HtmlPage;

/**
 * Demonstrates three ways of locating elements in a page loaded with HtmlUnit:
 * by element id, by tag name, and by XPath expression.
 */
public class Search {
	/**
	 * Loads {@code http://www.bjsxt.com} and prints the XML of selected elements
	 * to standard output.
	 *
	 * @param args command-line arguments (unused)
	 */
	public static void main(String[] args) {
		// Instantiate the web client, emulating the given browser. Declaring it in
		// try-with-resources guarantees it is closed on every exit path (success,
		// checked exception, or runtime exception), not only after an IOException.
		try (WebClient wc = new WebClient(BrowserVersion.FIREFOX_45)) {
			// Fetch and parse the page.
			HtmlPage page = wc.getPage("http://www.bjsxt.com");

			// Look up an HTML DOM element by its id.
			HtmlDivision hd = page.getHtmlElementById("LRdiv0");
			System.out.println(hd.asXml());

			System.out.println("-------------------------------");
			// Find every element with the given tag name.
			DomNodeList<DomElement> anchors = page.getElementsByTagName("a");
			for (DomElement anchor : anchors) {
				System.out.println(anchor.asXml());
			}

			System.out.println("-------------------------------");
			// Select elements by XPath and print the first match, if any.
			java.util.List<?> items = page.getByXPath("//div[@id='navMenu'][1]/ul/li");
			if (items.isEmpty()) {
				// Avoid an IndexOutOfBoundsException when the page layout has changed.
				System.err.println("No element matched the XPath expression");
			} else {
				HtmlListItem item = (HtmlListItem) items.get(0);
				System.out.println(item.asXml());
			}
		} catch (FailingHttpStatusCodeException | IOException e) {
			// MalformedURLException is a subclass of IOException, so it is handled here too.
			e.printStackTrace();
		}
	}
}