我写了一篇关于阅读镶木地板文件的博客文章(
http://www.jofre.de/?p=1459),并提出了以下解决方案,甚至能够读取INT96字段.
您需要以下maven依赖项:
org.apache.parquet
parquet-hadoop
1.9.0
org.apache.hadoop
hadoop-common
2.7.0
代码基本上是:
public class Main {
private static Path path = new Path("file:\\C:\\Users\\file.snappy.parquet");
private static void printGroup(Group g) {
int fieldCount = g.getType().getFieldCount();
for (int field = 0; field < fieldCount; field++) {
int valueCount = g.getFieldRepetitionCount(field);
Type fieldType = g.getType().getType(field);
String fieldName = fieldType.getName();
for (int index = 0; index < valueCount; index++) {
if (fieldType.isPrimitive()) {
System.out.println(fieldName + " " + g.getValueToString(field, index));
}
}
}
}
public static void main(String[] args) throws IllegalArgumentException {
Configuration conf = new Configuration();
try {
ParquetMetadata readFooter = ParquetFileReader.readFooter(conf, path, ParquetMetadataConverter.NO_FILTER);
MessageType schema = readFooter.getFileMetaData().getSchema();
ParquetFileReader r = new ParquetFileReader(conf, path, readFooter);
PageReadStore pages = null;
try {
while (null != (pages = r.readNextRowGroup())) {
final long rows = pages.getRowCount();
System.out.println("Number of rows: " + rows);
final MessageColumnIO columnIO = new ColumnIOFactory().getColumnIO(schema);
final RecordReader recordReader = columnIO.getRecordReader(pages, new GroupRecordConverter(schema));
for (int i = 0; i < rows; i++) {
final Group g = recordReader.read();
printGroup(g);
// TODO Compare to System.out.println(g);
}
}
} finally {
r.close();
}
} catch (IOException e) {
System.out.println("Error reading parquet file.");
e.printStackTrace();
}
}
}