使用JsonParser流式解析json，并使用DataFrame進行矩陣轉置。

需求：將一個結構不太規整的大型原始 JSON 檔案轉為 CSV 檔案；其中既有 {{}} 嵌套，也有 [[ ]] 嵌套。

思路：

1. 肯定不能使用原始的 List、Map……

2. 盡量減少對每一行（line）的遍歷次數。

3. 可适當采用中間檔案。

package convert;

import com.fasterxml.jackson.core.JsonFactory;
import com.fasterxml.jackson.core.JsonParseException;
import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.core.JsonToken;
import joinery.DataFrame;


import java.io.*;
import java.math.BigDecimal;


/**
 * @author zijian Wang
 * @date 2021/8/10 16:04
 * @VERSION 1.0
 */
public class convert2traindata {

    /** Charset used for every file this class opens itself. */
    private static final String CHARSET = "UTF-8";

    /** Column separator of all CSV files produced here. */
    private static final String SEPARATOR = ",";

    private static String filePath_origin;
    private static String outPutPath;

    // Intermediate files. main() replaces these defaults with paths derived from its arguments.
    private static String filePath_model = "E:\\change\\model_train.csv";
    private static String filePath_model_index = "E:\\change\\model_train_index.csv";
    private static String filePath_model_transpose = "E:\\change\\transpose.csv";
    private static String filePath_model_res;

    /**
     * Converts one JSON file into a CSV file.
     *
     * <p>The pipeline is: stream the JSON into intermediate files, append the data rows to
     * the index row, transpose the result and finally append the transposed rows below the
     * header of the result file.
     *
     * @param args {@code args[0]} is the input JSON file; {@code args[1]} is the output
     *             location. It is used as a plain string prefix, so it has to end with a
     *             path separator to denote a directory. Output files are named after the
     *             input file.
     * @throws IllegalArgumentException if fewer than two arguments are given
     * @throws IOException              if any file cannot be read or written
     */
    public static void main(String[] args) throws IOException {
        if (args == null || args.length < 2) {
            throw new IllegalArgumentException(
                    "Usage: convert2traindata <input.json> <output path ending with a separator>");
        }
        filePath_origin = args[0];
        outPutPath = args[1];

        String outPutFileName = baseName(filePath_origin);
        // Derive every output path from the input file name.
        filePath_model = outPutPath + outPutFileName + "_model.csv";
        filePath_model_index = outPutPath + outPutFileName + "_index.csv";
        filePath_model_transpose = outPutPath + outPutFileName + "_transpose.csv";
        filePath_model_res = outPutPath + outPutFileName + ".csv";

        long startTime = System.currentTimeMillis();
        convert2traindata();
        mergeFile(filePath_model, filePath_model_index);
        transpose(filePath_model_index);
        printResFile(filePath_model_transpose, filePath_model_res);
        long endTime = System.currentTimeMillis();
        System.out.println("程式運作時間： " + (endTime - startTime) + "ms");
    }

    /**
     * Streams the input JSON with a {@link JsonParser} and writes the intermediate files.
     *
     * <p>Three fields are recognised wherever they occur in the document:
     * <ul>
     *   <li>{@code stdNames}: its values become the header line of the result file,
     *       prefixed with a {@code ts} column;</li>
     *   <li>{@code times}: its values become the first line of the model file, and an
     *       index line {@code 0,1,2,...} of the same width goes to the index file;</li>
     *   <li>{@code dataMatrix}: every inner array becomes one line of the model file.</li>
     * </ul>
     * {@code times} is expected to appear before {@code dataMatrix}, because both are
     * written to the model file in document order.
     *
     * @throws IOException if the input cannot be parsed or an output cannot be written
     */
    public static void convert2traindata() throws IOException {
        JsonFactory jsonFactory = new JsonFactory();
        try (JsonParser parser = jsonFactory.createParser(new File(filePath_origin));
             BufferedWriter modelWriter = openWriter(filePath_model, false);
             BufferedWriter indexWriter = openWriter(filePath_model_index, false);
             BufferedWriter resWriter = openWriter(filePath_model_res, false)) {
            JsonToken token;
            // A blocking parser signals the end of input with null.
            while ((token = parser.nextToken()) != null) {
                if (token != JsonToken.FIELD_NAME) {
                    continue;
                }
                String fieldName = parser.getCurrentName();
                if ("stdNames".equals(fieldName)) {
                    writeHeader(parser, resWriter);
                } else if ("times".equals(fieldName)) {
                    writeTimes(parser, modelWriter, indexWriter);
                } else if ("dataMatrix".equals(fieldName)) {
                    writeMatrix(parser, modelWriter);
                }
            }
        }
    }

    /** Writes the header line {@code ts,<name>,<name>,...} from the array that follows the current field name. */
    private static void writeHeader(JsonParser parser, Writer out) throws IOException {
        if (parser.nextToken() != JsonToken.START_ARRAY) {
            parser.skipChildren();
            return;
        }
        StringBuilder header = new StringBuilder("ts");
        JsonToken token;
        while ((token = parser.nextToken()) != null && token != JsonToken.END_ARRAY) {
            header.append(SEPARATOR).append(parser.getText());
        }
        out.write(header.append("\n").toString());
    }

    /**
     * Writes the time values to the model file and a matching index line to the index file.
     * Every value is followed by a separator so that all lines of the merged file have the
     * same number of columns; the model line is terminated later by the matrix writer.
     */
    private static void writeTimes(JsonParser parser, Writer modelOut, Writer indexOut) throws IOException {
        if (parser.nextToken() != JsonToken.START_ARRAY) {
            parser.skipChildren();
            return;
        }
        int columnCount = 0;
        JsonToken token;
        while ((token = parser.nextToken()) != null && token != JsonToken.END_ARRAY) {
            // toPlainString() avoids scientific notation in the timestamps.
            modelOut.write(new BigDecimal(parser.getText()).toPlainString());
            modelOut.write(SEPARATOR);
            columnCount++;
        }
        for (int i = 0; i < columnCount; i++) {
            indexOut.write(String.valueOf(i));
            indexOut.write(SEPARATOR);
        }
        indexOut.write("\n");
    }

    /**
     * Writes the (nested) array that follows the current field name to the model file,
     * one line per closed array. Reading stops at the bracket that closes the outermost
     * array, so fields following the matrix are not mistaken for data.
     */
    private static void writeMatrix(JsonParser parser, Writer modelOut) throws IOException {
        // Terminates the line of time values written before the matrix.
        modelOut.write("\n");
        if (parser.nextToken() != JsonToken.START_ARRAY) {
            parser.skipChildren();
            return;
        }
        int depth = 1;
        while (depth > 0) {
            JsonToken token = parser.nextToken();
            if (token == null) {
                break;
            }
            if (token == JsonToken.START_ARRAY) {
                depth++;
            } else if (token == JsonToken.END_ARRAY) {
                depth--;
                // The outermost bracket yields an empty line, which mergeFile drops.
                modelOut.write("\n");
            } else {
                modelOut.write(parser.getText());
                modelOut.write(SEPARATOR);
            }
        }
    }

    /**
     * Appends every non-blank line of {@code file1} to {@code file2} and then deletes
     * {@code file1}.
     *
     * @param file1 file whose lines are appended
     * @param file2 file that receives the lines
     * @throws IOException if reading or writing fails
     */
    public static void mergeFile(String file1, String file2) throws IOException {
        try (BufferedReader in = openReader(file1);
             BufferedWriter out = openWriter(file2, true)) {
            String line;
            while ((line = in.readLine()) != null) {
                if (!line.trim().isEmpty()) {
                    out.write(line);
                    out.write("\n");
                }
            }
        }
        deleteIntermediate(file1);
    }

    /**
     * Transposes the CSV matrix stored in {@code filePath}, rewrites the first column of
     * the transposed frame in plain (non-scientific) notation, writes the result to the
     * transpose file and deletes {@code filePath}.
     *
     * @param filePath CSV file to transpose; its first line is read as the header
     * @throws IOException if reading or writing fails
     */
    public static void transpose(String filePath) throws IOException {
        DataFrame<Object> source =
                DataFrame.readCsv(filePath, SEPARATOR, DataFrame.NumberDefault.LONG_DEFAULT);
        DataFrame<Object> transposed = source.transpose();
        System.out.println(transposed.length());
        for (int row = 0; row < transposed.length(); row++) {
            Object cell = transposed.get(row, 0);
            // Lines ending with a separator produce a row without values; leave it untouched.
            if (cell == null || String.valueOf(cell).trim().isEmpty()) {
                continue;
            }
            transposed.set(row, 0, new BigDecimal(String.valueOf(cell)).toPlainString());
        }
        transposed.writeCsv(filePath_model_transpose);
        deleteIntermediate(filePath);
    }

    /**
     * Appends the data rows of {@code file1} to the result file {@code file2} and then
     * deletes {@code file1}. The first line of {@code file1} (the header written together
     * with the transposed frame) and rows without any cell content are skipped.
     *
     * @param file1 transposed CSV file
     * @param file2 result file, which already holds the header line
     * @throws IOException if reading or writing fails
     */
    public static void printResFile(String file1, String file2) throws IOException {
        try (BufferedReader in = openReader(file1);
             BufferedWriter out = openWriter(file2, true)) {
            // Discard the header line of the transposed file.
            String line = in.readLine();
            while (line != null && (line = in.readLine()) != null) {
                if (hasCellContent(line)) {
                    out.write(line);
                    out.write("\n");
                }
            }
        }
        deleteIntermediate(file1);
    }

    /** Returns the file name of {@code path} without directories and without its extension. */
    private static String baseName(String path) {
        int lastSeparator = Math.max(path.lastIndexOf('/'), path.lastIndexOf('\\'));
        String fileName = path.substring(lastSeparator + 1);
        int dot = fileName.lastIndexOf('.');
        return dot > 0 ? fileName.substring(0, dot) : fileName;
    }

    /** Returns whether {@code line} holds anything besides separators and whitespace. */
    private static boolean hasCellContent(String line) {
        for (int i = 0; i < line.length(); i++) {
            char c = line.charAt(i);
            if (c != ',' && !Character.isWhitespace(c)) {
                return true;
            }
        }
        return false;
    }

    /** Opens {@code path} for buffered UTF-8 writing, either truncating or appending. */
    private static BufferedWriter openWriter(String path, boolean append) throws IOException {
        return new BufferedWriter(new OutputStreamWriter(new FileOutputStream(path, append), CHARSET));
    }

    /** Opens {@code path} for buffered UTF-8 reading. */
    private static BufferedReader openReader(String path) throws IOException {
        return new BufferedReader(new InputStreamReader(new FileInputStream(path), CHARSET));
    }

    /** Deletes an intermediate file and reports on stderr when that is not possible. */
    private static void deleteIntermediate(String path) {
        File file = new File(path);
        if (file.exists() && !file.delete()) {
            System.err.println("Could not delete intermediate file: " + path);
        }
    }
}

經測試，3000 行的 JSON 約需 0.3 秒。

3 萬行的 JSON 約 2.8 秒執行完，效率足以應付基本需求。

使用JsonParser流式解析json，并使用DataFrame進行矩陣轉置。

使用JsonParser流式解析json，并使用DataFrame進行矩陣轉置。

繼續閱讀

【51CTO學院三周年】自學路上的伴侶

線上教育巨頭多鄰國Duolingo入華一周年，中國市場馬力全開

【分類算法】什麼是分類算法定義分類與聚類分類過程方法

申請評分模型拒絕推斷（RI）方法申請評分模型拒絕推斷（RI）方法

Sql優化一：sql語句優化

Nacos 2.0 更新前後性能對比壓測

尚矽谷—韓順平—圖解 Java設計模式（結構型）（55～）

Storm編譯打包過程中遇到的一些問題及解決方法

MapReduce的幾個企業級經典面試案例MapReduce的幾個企業級經典面試案例

9.spark Core 進階2--Cashe

淺談企業活動中進行資料分析的重要性

Ambari介紹和架構原理

spark/scala關于【資源檔案】加載方法概述外部檔案加載方案測試資源檔案打包入jar包中小結

NOSQL安全攻擊

win10本地scala和spark安裝安裝scala安裝spark

scala (3) Function 和 Method