天天看點

Java 讀取 doc 文檔

本文永久位址:https://my.oschina.net/bysu/blog/1528130

相關 jar 下載位址:

http://mirror.bit.edu.cn/apache/poi/dev/bin/poi-bin-3.17-beta1-20170701.tar.gz

import java.io.File;

import java.io.FileInputStream;

import org.apache.poi.POIXMLDocument;

import org.apache.poi.POIXMLTextExtractor;

import org.apache.poi.hwpf.extractor.WordExtractor;

import org.apache.poi.openxml4j.opc.OPCPackage;

import org.apache.poi.xwpf.extractor.XWPFWordExtractor;

import org.apache.poi.xwpf.usermodel.XWPFDocument;

/**
 * Extracts the plain text of Microsoft Word documents with Apache POI.
 *
 * <p>Files whose name ends in {@code .doc} are read through {@link WordExtractor};
 * files whose name ends in {@code .docx} are read through {@link XWPFWordExtractor}.
 */
public class ReadFromDoc {

    /**
     * Prints the extracted text of a hard-coded sample document to standard output.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        System.out.println(readWord("D:\\workspace\\java\\大學英語.doc"));
    }

    /**
     * Reads a Word document and returns its text with runs of consecutive line
     * breaks collapsed to a single line break.
     *
     * <p>The format is chosen from the file-name extension, compared without
     * regard to case. Any exception raised while reading is printed to standard
     * error and whatever text was obtained so far (normally the empty string) is
     * returned, so this method is best-effort and does not propagate read failures.
     *
     * @param filePath path of the {@code .doc} or {@code .docx} file to read;
     *     must not be {@code null}
     * @return the extracted text, or an empty string when the extension is not
     *     recognised or the file could not be read
     */
    public static String readWord(String filePath) {
        File file = new File(filePath);
        String fileName = file.getName();
        String text = "";

        try {
            if (hasExtension(fileName, ".doc")) {
                // Word 97-2003 binary format. try-with-resources guarantees the
                // stream and extractor are released even when extraction fails.
                try (FileInputStream stream = new FileInputStream(file);
                        WordExtractor extractor = new WordExtractor(stream)) {
                    text = collapseBlankLines(extractor.getText());
                }
            } else if (hasExtension(fileName, ".docx")) {
                // Word 2007+ OOXML format.
                OPCPackage pkg = POIXMLDocument.openPackage(filePath);
                boolean ownedByExtractor = false;
                try {
                    // Closing the extractor releases the underlying package, so
                    // the package is not closed a second time once it has been
                    // handed over.
                    try (POIXMLTextExtractor extractor =
                            new XWPFWordExtractor(new XWPFDocument(pkg))) {
                        ownedByExtractor = true;
                        text = collapseBlankLines(extractor.getText());
                    }
                } finally {
                    if (!ownedByExtractor) {
                        // Building the document/extractor failed: release the
                        // file handle without writing anything back to disk.
                        pkg.revert();
                    }
                }
            }
        } catch (Exception e) {
            // Deliberately best-effort: report the failure and fall through to
            // return what we have, as callers expect a String rather than an
            // exception.
            e.printStackTrace();
        }

        return text;
    }

    /** Returns whether {@code name} ends with {@code extension}, ignoring case. */
    private static boolean hasExtension(String name, String extension) {
        int start = name.length() - extension.length();
        return start >= 0 && name.regionMatches(true, start, extension, 0, extension.length());
    }

    /** Collapses two or more consecutive CRLF (then LF) line breaks into one. */
    private static String collapseBlankLines(String raw) {
        String collapsed = raw.replaceAll("(\\r\\n){2,}", "\r\n");
        return collapsed.replaceAll("(\\n){2,}", "\n");
    }
}