MapReduce读写orc文件

博客地址:http://www.fanlegefan.com

文章地址:http://www.fanlegefan.com/archives/mapreduceorc/

MapReduce 读取ORC格式文件

创建orc格式hive表

查看hive表结构

show create table test_orc
CREATE TABLE `test_orc`(
  `name` string, 
  `age` int)
ROW FORMAT SERDE 
  'org.apache.hadoop.hive.ql.io.orc.OrcSerde' 
STORED AS INPUTFORMAT 
  'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat' 
OUTPUTFORMAT 
  'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
LOCATION
  'hdfs://localhost:9000/user/work/warehouse/test_orc'
TBLPROPERTIES (
  'transient_lastDdlTime'='1502868725')

插入测试数据

jar依赖

<dependency>
    <groupId>org.apache.orc</groupId>
    <artifactId>orc-core</artifactId>
    <version>1.2.3</version>
</dependency>
<dependency>
    <groupId>org.apache.orc</groupId>
    <artifactId>orc-mapreduce</artifactId>
    <version>1.1.0</version>
</dependency>
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-mapreduce-client-core</artifactId>
    <version>2.6.0</version>
</dependency>

MR读取ORC格式文件代码如下

package com.fan.hadoop.orc;

import com.fan.hadoop.parquet.thrift.ParquetThriftWriterMR;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.orc.mapred.OrcStruct;
import org.apache.orc.mapreduce.OrcInputFormat;
import java.io.IOException;


public class OrcReaderMR {

    public static class OrcMap extends Mapper<NullWritable,OrcStruct,Text,IntWritable> {

        // Assume the ORC file has type: struct<s:string,i:int>
        public void map(NullWritable key, OrcStruct value,
                        Context output) throws IOException, InterruptedException {
            // take the first field as the key and the second field as the value
            output.write((Text) value.getFieldValue(),
                    (IntWritable) value.getFieldValue());
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        Job job = Job.getInstance(conf);
        job.setJarByClass(ParquetThriftWriterMR.class);
        job.setJobName("parquetthrfit");

        String in = "hdfs://localhost:9000/user/work/warehouse/test_orc";
        String out = "hdfs://localhost:9000/test/orc";

        job.setMapperClass(OrcMap.class);
        OrcInputFormat.addInputPath(job, new Path(in));
        job.setInputFormatClass(OrcInputFormat.class);
        job.setNumReduceTasks();

        job.setOutputFormatClass(TextOutputFormat.class);

        FileOutputFormat.setOutputPath(job, new Path(out));


        job.waitForCompletion(true);
    }

}

查看生成文件

hadoop dfs -cat /test/orc/part-m-00000

kafka   
tensflow        
hadoop  
hbase   
flume   
kafka   
kafka   
flume   
tensflow        
flume

MR写ORC格式文件

package com.fan.hadoop.orc;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.orc.OrcConf;
import org.apache.orc.TypeDescription;
import org.apache.orc.mapred.OrcStruct;
import org.apache.orc.mapreduce.OrcOutputFormat;
import java.io.IOException;

public class OrcWriterMR {

    public static class OrcWriterMapper
            extends Mapper<LongWritable,Text,NullWritable,OrcStruct> {


        private TypeDescription schema =
                TypeDescription.fromString("struct<name:string,age:int>");

        private OrcStruct pair = (OrcStruct) OrcStruct.createValue(schema);


        private final NullWritable nada = NullWritable.get();
        private Text name = new Text();
        private IntWritable age = new IntWritable();

        public void map(LongWritable key, Text value,
                           Context output
        ) throws IOException, InterruptedException {

            if(!"".equals(value.toString())){
                String[] arr = value.toString().split("\t");
                name.set(arr[]);
                age.set(Integer.valueOf(arr[]));
                pair.setFieldValue(, name);
                pair.setFieldValue(,age);
                output.write(nada, pair);
            }

        }
    }



    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        OrcConf.MAPRED_OUTPUT_SCHEMA.setString(conf,"struct<name:string,age:int>");

        Job job = Job.getInstance(conf);
        job.setJarByClass(OrcWriterMR.class);
        job.setJobName("OrcWriterMR");

        String in = "hdfs://localhost:9000/user/work/warehouse/test/ddd.txt";
        String out = "hdfs://localhost:9000/test/orc2";


        job.setMapperClass(OrcWriterMapper.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setNumReduceTasks();

        job.setOutputFormatClass(OrcOutputFormat.class);
        FileInputFormat.addInputPath(job, new Path(in));

        OrcOutputFormat.setOutputPath(job, new Path(out));


        job.waitForCompletion(true);
    }
}

查看生成文件

#### 生成orc文件
 hadoop dfs -ls /test/orc2

-rw-r--r--   3 work supergroup          0 2017-08-16 17:45 /test/orc2/_SUCCESS
-rw-r--r--   3 work supergroup    6314874 2017-08-16 17:45 /test/orc2/part-m-00000.orc

将数据放到hive表路径下

hadoop dfs -cp /test/orc2/part-m-00000.orc /user/work/warehouse/test_orc/

在hive表中查看数据

hive> select * from test_orc limit 13;
OK
kafka   
tensflow        
hadoop  
hbase   
flume   
kafka   
kafka   
flume   
tensflow        
flume   
flume   
tensflow        
flume   
Time taken:  seconds, Fetched:  row(s)

MapReduce读写orc文件

MapReduce 读取ORC格式文件

创建orc格式hive表

查看hive表结构

插入测试数据

jar依赖

MR读取ORC格式文件代码如下

查看生成文件

MR写ORC格式文件

查看生成文件

将数据放到hive表路径下

在hive表中查看数据

继续阅读

MapReduce运行Wordcount时一直卡在INFO mapreduce.Job: Running job，web查看一直处于accepted阶段

ubuntu hadoop2.6.1，terminal下运行wordcount

MapReduce(一)：入门级程序wordcount及其分析

HiveQl语句应用实例：WordCount具体步骤如下：

hadoop操作遇到的问题问题一：输出文件已存在

用mapreduce计算wordCount和手机流量统计程序运行过程WordCount统计手机流量统计

Hadoop之运行wordcount

jdk1.7+Eclipse+Maven3.5+Hadoop2.7.3构建hadoop项目

Eclipse运行WordCount（详细版）相关连接Eclipse运行WordCount

专家访谈：搜索开源力量：Lucene技术前景

hadoop 用MR实现join操作

Centos7 下 Hadoop 2.6.4 分布式集群环境搭建摘要集群准备安装JDK 安装 Hadoop 2.6.4 部署 slaver1-slaver4 启动 hadoop 集群成功了

MapReduce的几个企业级经典面试案例MapReduce的几个企业级经典面试案例

ubuntu14.04下安装hbse1.0.1.1

User Defined Hadoop DataType

Ambari介绍和架构原理