方法一、自己写代码
1.pom依赖
<dependency>
<groupId>org.apache.parquet</groupId>
<artifactId>parquet-hadoop</artifactId>
<version>1.9.0</version>
</dependency>
<!-- 注意：com.github.jiayuhan-it 是 hadoop-common 的第三方打包版本（2.7.3JYHTEST），
     便于在 Windows 本地直接运行；生产环境请改用官方的 org.apache.hadoop:hadoop-common -->
<dependency>
<groupId>com.github.jiayuhan-it</groupId>
<artifactId>hadoop-common</artifactId>
<version>2.7.3JYHTEST</version>
</dependency>
2.代码实现
import org.apache.parquet.example.data.Group;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.example.GroupReadSupport;
import org.apache.parquet.io.api.Binary;
import org.apache.parquet.schema.Type;
import java.io.IOException;
import java.math.BigDecimal;
import java.math.BigInteger;
import org.apache.hadoop.fs.Path;
import java.util.HashMap;
import java.util.Map;
public class ParquetTest {
public static void main(String[] args) throws IOException {
//1.创建GroupReadSupport
GroupReadSupport readSupport = new GroupReadSupport();
//2.使用ParquetReader.builder方法创建reader对象。
ParquetReader.Builder<Group> reader = ParquetReader.builder(readSupport, new Path("D:\\part-00001-de10a7bd-e360-4c02-b4f4-1c30c6b91be3-c000.snappy.parquet"));
ParquetReader<Group> build = reader.build();
//3.读取一个数据,并输出group的schema列名
Group group = null;
if (((group = build.read()) != null)) {
int fieldCount = group.getType().getFieldCount();
for (int field = 0; field < fieldCount; field++) {
Type fieldType = group.getType().getType(field);
String fieldName = fieldType.getName();
System.out.println(fieldName);
}
}
}
方法二、使用社区工具
1.下载社区工具
parquet-tools-1.6.0rc3-SNAPSHOT.jar
git project: https://github.com/apache/parquet-mr/tree/master/parquet-tools
2.查看schema信息
(我在windows下执行的,jar包和parquet文件都在D盘)
java -jar D:\parquet-tools-1.6.0rc3-SNAPSHOT.jar schema -d D:\part-00001-de10a7bd-e360-4c02-b4f4-1c30c6b91be3-c000.snappy.parquet
结果:
D:\>java -jar D:\parquet-tools-1.6.0rc3-SNAPSHOT.jar schema -d D:\part-00001-de10a7bd-e360-4c02-b4f4-1c30c6b91be3-c000.snappy.parquet
message spark_schema {
required binary range_partition_date (UTF8);
required int64 Id;
required int64 UserId;
optional binary dw_cre_date (UTF8);
optional binary dw_upd_date (UTF8);
optional int32 kudu_is_active (INT_8);
optional binary FlowId (UTF8);
optional binary Type (UTF8);
optional binary Content (UTF8);
optional binary InsertTime (UTF8);
optional binary UpdateTime (UTF8);
optional int32 IsActive;
optional int64 listingId;
optional int64 bizId;
optional int64 dingId;
optional int64 zuId;
}
。。。。。
3.查看数据
(-n 后指定数据条数，1 表示只取一条)
D:\>java -jar D:\parquet-tools-1.6.0rc3-SNAPSHOT.jar head -n 1 D:\part-00001-de10a7bd-e360-4c02-b4f4-1c30c6b91be3-c000.snappy.parquet
结果:
D:\>java -jar D:\parquet-tools-1.6.0rc3-SNAPSHOT.jar head -n 1 D:\part-00001-de10a7bd-e360-4c02-b4f4-1c30c6b91be3-c000.snappy.parquet
range_partition_date = 2021-01-07
Id = 18716586
dw_cre_date = 2021-01-06 23:59:20
dw_upd_date = 2021-01-06 23:59:20
kudu_is_active = 1
FlowId = [email protected][email protected]@[email protected]@0
Type = 30001026_drools_input
Content = <batch-execution lookup="defaultStatelessKieSession">
<insert entry-point="DEFAULT" out-identifier="LoyalCustomerAuditSimplified-output" return-object="true">
<ppdaidrools.loyalcustomerauditsimplified.LoyalCustomerAudit>
<blackListID>10</blackListID>
<daeType>-1</daeType>
<testVipList>0</testVipList>
<daeDuedate>1900-01-01 00:00:00 +0800</daeDuedate>
<xinkeEdu>-1</xinkeEdu>
<xinkeGeieTime>1900-01-01 00:00:00 +0800</xinkeGeieTime>
<bizid>11001</bizid>
<zuid>12</zuid>
<dingid>102</dingid>
<listtype>21</listtype>
<sublisttype>18</sublisttype>
<repaymentSourceType>1</repaymentSourceType>
<tongDunAll6mId>21</tongDunAll6mId>
<tongDunAll3mId>7</tongDunAll3mId>
<tongDunP2PNetLoan1mId>0</tongDunP2PNetLoan1mId>
<tongDunSmallLoan1mId>1</tongDunSmallLoan1mId>
。。。。。。
方法二参考自：https://www.cnblogs.com/yako/p/7889341.html