需求:將一個結構化不太好的原始大型 JSON 檔案轉為 CSV 檔案;資料中既有 {{}} 嵌套,也有 [[ ]] 嵌套。
思路:
1. 肯定不能直接使用原始的 List、Map 等結構(檔案太大,不適合整個載入記憶體)。
2. 盡量減少對每一行(line)的遍歷次數。
3. 可適當採用中間檔案。
package convert;
import com.fasterxml.jackson.core.JsonFactory;
import com.fasterxml.jackson.core.JsonParseException;
import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.core.JsonToken;
import joinery.DataFrame;
import java.io.*;
import java.math.BigDecimal;
/**
 * Converts a large, loosely structured JSON file into a CSV file.
 *
 * <p>Pipeline, as driven by {@link #main(String[])}:
 * <ol>
 *   <li>{@link #convert2traindata()} streams the JSON with Jackson and writes the CSV header
 *       (result file), the {@code times} row followed by one row per {@code dataMatrix} inner
 *       array (model file), and a row of running column numbers (index file).</li>
 *   <li>{@link #mergeFile(String, String)} appends the model rows to the index file.</li>
 *   <li>{@link #transpose(String)} transposes that file with joinery.</li>
 *   <li>{@link #printResFile(String, String)} appends the transposed rows below the header.</li>
 * </ol>
 *
 * <p>All state is kept in static fields, so the class is not safe for concurrent use.
 *
 * @author zijian Wang
 * @date 2021/8/10 16:04
 * @version 1.0
 */
public class convert2traindata {

    /** Charset used for every file this class writes itself. */
    private static final java.nio.charset.Charset UTF_8 = java.nio.charset.StandardCharsets.UTF_8;

    /**
     * Lines with at most this many characters are dropped when files are appended.
     * This removes blank lines. NOTE(review): it presumably also removes the all-empty row that
     * the trailing commas produce after transposition; a legitimate row this short would be
     * dropped as well - confirm the threshold against real data.
     */
    private static final int SHORT_LINE_LIMIT = 17;

    private static String filePath_origin;
    private static String outPutPath;

    // Intermediate files. The defaults are only used when the public methods are called
    // without going through main(), which overwrites them.
    private static String filePath_model = "E:\\change\\model_train.csv";
    private static String filePath_model_index = "E:\\change\\model_train_index.csv";
    private static String filePath_model_transpose = "E:\\change\\transpose.csv";
    private static String filePath_model_res;

    /**
     * Runs the whole conversion and prints the elapsed time.
     *
     * @param args {@code args[0]} is the input JSON file, {@code args[1]} the output directory;
     *             the result is written there as {@code <input base name>.csv}
     * @throws IOException              if any file cannot be read or written
     * @throws IllegalArgumentException if fewer than two arguments are given
     */
    public static void main(String[] args) throws IOException {
        if (args == null || args.length < 2) {
            throw new IllegalArgumentException(
                    "usage: convert2traindata <input.json> <output directory>");
        }
        filePath_origin = args[0];
        outPutPath = args[1];

        // Derive every output path from the input's base name.
        String baseName = baseNameOf(filePath_origin);
        String outputPrefix = withTrailingSeparator(outPutPath);
        filePath_model = outputPrefix + baseName + "_model.csv";
        filePath_model_index = outputPrefix + baseName + "_index.csv";
        filePath_model_transpose = outputPrefix + baseName + "_transpose.csv";
        filePath_model_res = outputPrefix + baseName + ".csv";

        long startTime = System.currentTimeMillis();
        convert2traindata();
        mergeFile(filePath_model, filePath_model_index);
        transpose(filePath_model_index);
        printResFile(filePath_model_transpose, filePath_model_res);
        long endTime = System.currentTimeMillis();
        System.out.println("程式運作時間: " + (endTime - startTime) + "ms");
    }

    /** Returns the file name of {@code path} without its directory and without its last extension. */
    private static String baseNameOf(String path) {
        String name = new File(path).getName();
        int dot = name.lastIndexOf('.');
        return dot >= 0 ? name.substring(0, dot) : name;
    }

    /** Returns {@code dir} ending in a path separator; an empty string is returned unchanged. */
    private static String withTrailingSeparator(String dir) {
        if (dir.isEmpty() || dir.endsWith("/") || dir.endsWith(File.separator)) {
            return dir;
        }
        return dir + File.separator;
    }

    /**
     * Extracts {@code stdNames}, {@code times} and {@code dataMatrix} from the input JSON with a
     * streaming parser and writes them to the result, model and index files.
     *
     * <p>The parser is advanced two tokens per iteration (name, then value) and nested objects
     * are walked into rather than skipped, so the fields are found at any object depth. Parsing
     * stops at the end of input or at the first position that has no current field name.
     *
     * <p>The {@code dataMatrix} loop consumes tokens up to the end of the enclosing object, so
     * {@code times} has to appear before {@code dataMatrix}, and any sibling fields that follow
     * {@code dataMatrix} end up in the model file as well.
     *
     * @throws IOException if the input cannot be parsed or an output file cannot be opened
     */
    public static void convert2traindata() throws IOException {
        JsonFactory jsonFactory = new JsonFactory();
        try (PrintWriter modelWriter = openUtf8Writer(filePath_model);
             PrintWriter indexWriter = openUtf8Writer(filePath_model_index);
             PrintWriter headerWriter = openUtf8Writer(filePath_model_res);
             JsonParser parser = jsonFactory.createParser(new File(filePath_origin))) {
            parser.nextToken(); // consume the first (root) token
            // A blocking parser reports end of input as null, never NOT_AVAILABLE,
            // so the effective exit is the null check below.
            while (parser.nextToken() != JsonToken.NOT_AVAILABLE) {
                String fieldName = parser.getCurrentName();
                if (parser.nextToken() == null || fieldName == null) {
                    break;
                }
                switch (fieldName) {
                    case "stdNames":
                        writeHeader(parser, headerWriter);
                        // Closed right away: PrintWriter silently ignores later writes,
                        // so only the first occurrence of the field is used.
                        headerWriter.close();
                        break;
                    case "times":
                        writeTimes(parser, modelWriter, indexWriter);
                        indexWriter.close();
                        break;
                    case "dataMatrix":
                        writeMatrix(parser, modelWriter);
                        modelWriter.close();
                        break;
                    default:
                        break;
                }
            }
        }
    }

    /** Opens {@code path} for writing (truncating it) as buffered UTF-8 text. */
    private static PrintWriter openUtf8Writer(String path) throws IOException {
        return new PrintWriter(new OutputStreamWriter(
                new BufferedOutputStream(new FileOutputStream(path)), UTF_8));
    }

    /** Moves past an array's opening token if the parser is currently positioned on it. */
    private static void skipArrayStart(JsonParser parser) throws IOException {
        if (parser.currentToken() == JsonToken.START_ARRAY) {
            parser.nextToken();
        }
    }

    /** Tells whether the parser is at the end of the current array or has run out of input. */
    private static boolean atArrayEnd(JsonParser parser) {
        JsonToken token = parser.currentToken();
        return token == null || token == JsonToken.END_ARRAY;
    }

    /** Writes the header line: {@code ts} followed by every element of the current array. */
    private static void writeHeader(JsonParser parser, PrintWriter out) throws IOException {
        skipArrayStart(parser);
        out.append("ts");
        while (!atArrayEnd(parser)) {
            out.append(",").append(parser.getText());
            parser.nextToken();
        }
        out.write("\n");
    }

    /**
     * Writes every element of the current array to the model file as a plain (non-exponent)
     * decimal followed by a comma, and a matching row {@code 0,1,...,n-1,} to the index file.
     * The model row is left unterminated; the line break is written with the matrix.
     */
    private static void writeTimes(JsonParser parser, PrintWriter model, PrintWriter index)
            throws IOException {
        skipArrayStart(parser);
        int count = 0;
        while (!atArrayEnd(parser)) {
            model.append(new BigDecimal(parser.getText()).toPlainString()).append(",");
            count++;
            parser.nextToken();
        }
        for (int i = 0; i < count; i++) {
            index.append(String.valueOf(i)).append(",");
        }
        index.append("\n");
    }

    /**
     * Writes the matrix to the model file: every value is followed by a comma and every closing
     * bracket ends the line. Tokens are consumed until the enclosing object ends.
     */
    private static void writeMatrix(JsonParser parser, PrintWriter model) throws IOException {
        model.append("\n"); // terminates the times row written earlier
        JsonToken token = parser.currentToken();
        while (token != null && token != JsonToken.END_OBJECT) {
            if (token == JsonToken.END_ARRAY) {
                model.append("\n");
            } else if (token != JsonToken.START_ARRAY) {
                model.append(parser.getText()).append(",");
            }
            token = parser.nextToken();
        }
    }

    /**
     * Appends the data lines of {@code file1} (the model file) to {@code file2} (the index file)
     * and then deletes {@code file1}. Lines of 17 characters or fewer are skipped.
     *
     * @param file1 path of the file to read and delete
     * @param file2 path of the file to append to
     * @throws IOException if reading or writing fails
     */
    public static void mergeFile(String file1, String file2) throws IOException {
        appendDataLines(file1, UTF_8, file2, 0);
        deleteIntermediate(file1);
    }

    /**
     * Transposes the CSV at {@code filePath}, rewrites the first column of every row except the
     * last as a plain (non-exponent) decimal string, writes the result to the transpose file and
     * deletes {@code filePath}. The number of transposed rows is printed to standard output.
     *
     * <p>NOTE(review): the last row is left untouched, presumably because the trailing comma on
     * every input line yields an empty final row that cannot be parsed as a number - confirm.
     *
     * @param filePath path of the CSV file to transpose and delete
     * @throws IOException if reading or writing fails
     */
    public static void transpose(String filePath) throws IOException {
        DataFrame df = DataFrame.readCsv(filePath, ",", DataFrame.NumberDefault.LONG_DEFAULT);
        DataFrame<String> transposed = df.transpose();
        System.out.println(transposed.length());
        for (int row = 0; row < transposed.length() - 1; row++) {
            String plain = new BigDecimal(String.valueOf(transposed.get(row, 0))).toPlainString();
            transposed.set(row, 0, plain);
        }
        transposed.writeCsv(filePath_model_transpose);
        deleteIntermediate(filePath);
    }

    /**
     * Appends the data lines of {@code file1} (the transposed file) to {@code file2} (the result
     * file, which already holds the header) and then deletes {@code file1}. The first line of
     * {@code file1} and lines of 17 characters or fewer are skipped.
     *
     * @param file1 path of the file to read and delete
     * @param file2 path of the file to append to
     * @throws IOException if reading or writing fails
     */
    public static void printResFile(String file1, String file2) throws IOException {
        // file1 is written by joinery, not by this class, so it is still read with the platform
        // default charset. NOTE(review): confirm which charset joinery's writeCsv uses.
        appendDataLines(file1, java.nio.charset.Charset.defaultCharset(), file2, 1);
        deleteIntermediate(file1);
    }

    /**
     * Copies lines from {@code source} to the end of {@code target}, skipping the first
     * {@code linesToSkip} lines and every line that is not longer than {@link #SHORT_LINE_LIMIT}.
     */
    private static void appendDataLines(String source, java.nio.charset.Charset sourceCharset,
                                        String target, int linesToSkip) throws IOException {
        try (BufferedReader reader = new BufferedReader(
                     new InputStreamReader(new FileInputStream(source), sourceCharset));
             BufferedWriter writer = new BufferedWriter(
                     new OutputStreamWriter(new FileOutputStream(target, true), UTF_8))) {
            String line;
            int lineNumber = 0;
            while ((line = reader.readLine()) != null) {
                if (lineNumber >= linesToSkip && line.length() > SHORT_LINE_LIMIT) {
                    writer.write(line);
                    writer.write("\n");
                }
                lineNumber++;
            }
        }
    }

    /** Deletes an intermediate file; a failure is reported on standard error but is not fatal. */
    private static void deleteIntermediate(String path) {
        File file = new File(path);
        if (file.exists() && !file.delete()) {
            System.err.println("Could not delete intermediate file: " + path);
        }
    }
}
測試後,3000 行的 JSON 約需 0.3 秒;
3 萬行的約 2.8 秒即可執行完,效率應付基本需求完全夠用。