本文永久位址:https://my.oschina.net/bysu/blog/1528130
相關jar下載位址:
http://mirror.bit.edu.cn/apache/poi/dev/bin/poi-bin-3.17-beta1-20170701.tar.gz
import java.io.File;
import java.io.FileInputStream;
import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
/**
 * Extracts the plain text of a Microsoft Word file using Apache POI.
 *
 * <p>Files whose name ends in {@code .doc} are read with {@link WordExtractor};
 * files whose name ends in {@code .docx} are read with {@link XWPFWordExtractor}.
 */
public class ReadFromDoc {

    /**
     * Demo entry point: prints the text extracted from a hard-coded sample file.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        System.out.println(readWord("D:\\workspace\\java\\大學英語.doc"));
    }

    /**
     * Reads a Word document and returns its text with runs of consecutive
     * line breaks collapsed to a single line break.
     *
     * <p>The format is chosen from the file-name extension, compared
     * case-insensitively. This method is best-effort: any failure while opening
     * or parsing the file is reported on {@code System.err} and an empty string
     * is returned.
     *
     * @param filePath path of the {@code .doc} or {@code .docx} file to read
     * @return the extracted text, or {@code ""} if the extension is neither
     *         {@code .doc} nor {@code .docx} or if the file could not be read
     * @throws NullPointerException if {@code filePath} is {@code null}
     */
    public static String readWord(String filePath) {
        File file = new File(filePath);
        String fileName = file.getName();
        try {
            if (hasExtension(fileName, ".doc")) { // Word 2003 binary format
                // try-with-resources closes the extractor and the stream even
                // when construction or getText() throws.
                try (FileInputStream stream = new FileInputStream(file);
                     WordExtractor extractor = new WordExtractor(stream)) {
                    return collapseBlankLines(extractor.getText());
                }
            }
            if (hasExtension(fileName, ".docx")) { // Word 2007+ OOXML format
                OPCPackage pkg = POIXMLDocument.openPackage(filePath);
                try {
                    XWPFDocument document = new XWPFDocument(pkg);
                    POIXMLTextExtractor extractor = new XWPFWordExtractor(document);
                    return collapseBlankLines(extractor.getText());
                } finally {
                    // revert() releases the underlying file WITHOUT saving;
                    // close() on a package opened from a path would write the
                    // package back to disk, which a read-only extraction must not do.
                    pkg.revert();
                }
            }
        } catch (Exception e) {
            // Best-effort contract: report the failure and fall through to "".
            System.err.println("Failed to read Word document: " + filePath);
            e.printStackTrace();
        }
        return "";
    }

    /** Returns whether {@code fileName} ends with {@code extension}, ignoring case. */
    private static boolean hasExtension(String fileName, String extension) {
        int offset = fileName.length() - extension.length();
        // regionMatches returns false for a negative offset, i.e. a too-short name.
        return fileName.regionMatches(true, offset, extension, 0, extension.length());
    }

    /** Collapses two or more consecutive CRLF (then LF) line breaks into one. */
    private static String collapseBlankLines(String text) {
        String collapsed = text.replaceAll("(\\r\\n){2,}", "\r\n");
        return collapsed.replaceAll("(\\n){2,}", "\n");
    }
}