Reposted from: http://www.cnblogs.com/xiaolong1032/p/4529534.html

1: Implementing a custom InputFormat

*The data source is in memory.
*1. InputFormat handles arbitrary data sources; the implementation below reads its data from memory.
*1.1 In the driver, call job.setInputFormatClass(MyselfMemoryInputFormat.class).
*1.2 Extend InputFormat<K, V> and implement its two methods: getSplits(..) and createRecordReader(..).
*1.3 getSplits(..) returns a java.util.List whose elements are InputSplits; each InputSplit corresponds to one mapper task.
*1.4 An InputSplit is a partition of the raw data source; with huge inputs, splitting is unavoidable, and the size of each InputSplit is entirely up to us. In this example the data is generated in memory and then wrapped in the InputSplit.
*1.5 The InputSplit holds Hadoop data types and implements the Writable interface.
*1.6 The RecordReader reads the data in each InputSplit and parses it into <k,v> pairs for map to process.
*1.7 RecordReader has four core methods: initialize(..), nextKeyValue(), getCurrentKey(), getCurrentValue().
*1.8 initialize(..) matters because it is where the InputSplit is received and temporary state is set up.
*1.9 Each call to nextKeyValue() advances to the next key and value.
*1.10 After nextKeyValue() is called, getCurrentKey() and getCurrentValue() are called right after;
*       this loop is driven by Mapper's run() method (see the sketch below).
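
The driving loop lives in Mapper.run(): the framework repeatedly asks the Context (which delegates to the RecordReader) for the next pair and hands it to map(). A simplified sketch of that loop, for illustration only (the actual Hadoop source differs slightly in details):

    public void run(Context context) throws IOException, InterruptedException {
        setup(context);
        while (context.nextKeyValue()) {    // delegates to RecordReader.nextKeyValue()
            // key/value come from RecordReader.getCurrentKey()/getCurrentValue()
            map(context.getCurrentKey(), context.getCurrentValue(), context);
        }
        cleanup(context);
    }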

public class MyselInputFormatApp {
    private static final String OUT_PATH = "hdfs://hadoop1:9000/out";// output path; the reduce job writes its result into this directory

    public static void main(String[] args) {
        Configuration conf = new Configuration();// configuration object
        try {
            FileSystem fileSystem = FileSystem.get(new URI(OUT_PATH), conf);
            fileSystem.delete(new Path(OUT_PATH), true);
            Job job = new Job(conf, MyselInputFormatApp.class.getSimpleName());// jobName: the job's name
            job.setJarByClass(MyselInputFormatApp.class);
            job.setInputFormatClass(MyselfMemoryInputFormat.class);
            job.setMapperClass(MyMapper.class);// custom map class
            job.setMapOutputKeyClass(Text.class);// map output key type
            job.setMapOutputValueClass(LongWritable.class);// map output value type
            job.setReducerClass(MyReducer.class);// custom reduce class
            job.setOutputKeyClass(Text.class);// reduce output key type
            job.setOutputValueClass(LongWritable.class);// reduce output value type
            FileOutputFormat.setOutputPath(job, new Path(OUT_PATH));// where the final reduce output goes
            job.waitForCompletion(true);// submit to the JobTracker and wait for completion
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    public static class MyMapper extends Mapper<NullWritable, Text, Text, LongWritable> {
        @Override
        protected void map(NullWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            String[] splited = line.split("\t");
            for (String word : splited) {
                context.write(new Text(word), new LongWritable(1));// emit each word with a count of 1
            }
        }
    }

    public static class MyReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context)
                throws IOException, InterruptedException {
            long count = 0L;
            for (LongWritable times : values) {
                count += times.get();
            }
            context.write(key, new LongWritable(count));
        }
    }

    /**
     * Generates data in memory and parses it into key/value pairs.
     */
    public static class MyselfMemoryInputFormat extends InputFormat<NullWritable, Text> {
        @Override
        public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
            ArrayList<InputSplit> result = new ArrayList<InputSplit>();
            result.add(new MemoryInputSplit());
            result.add(new MemoryInputSplit());
            result.add(new MemoryInputSplit());
            return result;
        }

        @Override
        public RecordReader<NullWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context)
                throws IOException, InterruptedException {
            return new MemoryRecordReader();
        }
    }

    public static class MemoryInputSplit extends InputSplit implements Writable {
        int SIZE = 10;
        // Plain Java arrays are not Hadoop Writables, so the split stores its records
        // in Hadoop's ArrayWritable rather than a Java array.
        ArrayWritable arrayWritable = new ArrayWritable(Text.class);

        /**
         * Builds a Java array first and then wraps it in the Hadoop type.
         * @throws FileNotFoundException
         */
        public MemoryInputSplit() throws FileNotFoundException {
            // One InputSplit feeds one map task; for the map function to be called several times,
            // the InputSplit must yield several key/value pairs.
            Text[] array = new Text[SIZE];
            Random random = new Random();
            for (int i = 0; i < SIZE; i++) {
                int nextInt = random.nextInt(999999);
                Text text = new Text("Text" + nextInt);
                array[i] = text;
            }
            // FileInputStream fs = new FileInputStream(new File("\\etc\\profile"));// read from a file instead
            // Parse the data from the stream into the data structure.
            arrayWritable.set(array);
        }

        @Override
        public long getLength() throws IOException, InterruptedException {
            return SIZE;
        }

        @Override
        public String[] getLocations() throws IOException, InterruptedException {
            return new String[]{};
        }

        public ArrayWritable getValues() {
            return arrayWritable;
        }

        @Override
        public void write(DataOutput out) throws IOException {
            arrayWritable.write(out);
        }

        @Override
        public void readFields(DataInput in) throws IOException {
            arrayWritable.readFields(in);
        }
    }

    public static class MemoryRecordReader extends RecordReader<NullWritable, Text> {
        private Writable[] values = null;
        private Text value = null;
        private int i = 0;

        @Override
        public void initialize(InputSplit split, TaskAttemptContext context)
                throws IOException, InterruptedException {
            MemoryInputSplit inputSplit = (MemoryInputSplit) split;
            ArrayWritable writables = inputSplit.getValues();
            this.values = writables.get();
            this.i = 0;
        }

        @Override
        public boolean nextKeyValue() throws IOException, InterruptedException {
            if (i >= values.length) {
                return false;
            }
            if (null == this.value) {
                value = new Text();
            }
            value.set((Text) values[i]);
            i++;
            return true;
        }

        @Override
        public NullWritable getCurrentKey() throws IOException, InterruptedException {
            return NullWritable.get();
        }

        @Override
        public Text getCurrentValue() throws IOException, InterruptedException {
            return value;
        }

        @Override
        public float getProgress() throws IOException, InterruptedException {
            return 0;
        }

        /**
         * Called when the reader is finished; nothing to release here.
         */
        @Override
        public void close() throws IOException {
        }
    }
}

2: Implementing a custom OutputFormat

Common output formats: TextOutputFormat: the default output format; key and value are separated by a tab.
        DBOutputFormat: writes the output to a database.
        SequenceFileOutputFormat: writes keys and values in SequenceFile format.
        SequenceFileAsBinaryOutputFormat: writes keys and values to a SequenceFile as raw binary.
        MapFileOutputFormat: writes keys and values into a MapFile. Because keys in a MapFile are sorted, records must be written in key order.
        MultipleOutputFormat: output to multiple files. By default one reducer produces one output file; when a reducer should produce several outputs, MultipleOutputFormat and MultipleOutputs provide that (see the MultipleOutputs sketch below).
          MultipleOutputFormat: also lets you customize the output file names.
          Subclassing MultipleOutputFormat requires implementing
            getBaseRecordWriter():
            generateFileNameForKeyValue(): determines the output file name from the key and value.
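
With the new (org.apache.hadoop.mapreduce) API, the same effect is usually achieved with the MultipleOutputs helper instead. A minimal reducer-side sketch, assuming the driver has registered a named output called "hello" via MultipleOutputs.addNamedOutput(job, "hello", TextOutputFormat.class, Text.class, LongWritable.class) (the name and the routing rule are just illustrative choices):

    public static class MyMultipleOutputsReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
        private MultipleOutputs<Text, LongWritable> mos;

        @Override
        protected void setup(Context context) {
            mos = new MultipleOutputs<Text, LongWritable>(context);
        }

        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context)
                throws IOException, InterruptedException {
            long count = 0L;
            for (LongWritable times : values) {
                count += times.get();
            }
            if (key.toString().startsWith("hello")) {
                mos.write("hello", key, new LongWritable(count));// goes to files named hello-r-xxxxx
            } else {
                context.write(key, new LongWritable(count));// goes to the default part-r-xxxxx files
            }
        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            mos.close();// flush and close all named outputs
        }
    }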

/**
 * Custom OutputFormat: handles arbitrary output destinations.
 * 1. The key/value pairs the OutputFormat writes come from the Reducer class and are written through a RecordWriter.
 * 2. RecordWriter.write(..) only receives the key and value; where do they go? An output stream has to be passed in separately, and write(..) writes the k,v pair into that OutputStream.
 * 3. The RecordWriter is obtained from the OutputFormat, so a custom OutputFormat must extend OutputFormat, and the stream object has to be created inside getRecordWriter(..).
 */
public class MySelfOutputFormatApp {
    private static final String INPUT_PATH = "hdfs://hadoop1:9000/abd/hello";// input path
    private static final String OUT_PATH = "hdfs://hadoop1:9000/out";// output path; the reduce job writes its result into this directory
    private static final String OUT_FILE_NAME = "/abc";

    public static void main(String[] args) {
        Configuration conf = new Configuration();
        try {
            FileSystem fileSystem = FileSystem.get(new URI(OUT_PATH), conf);
            fileSystem.delete(new Path(OUT_PATH), true);
            Job job = new Job(conf, MySelfOutputFormatApp.class.getSimpleName());
            job.setJarByClass(MySelfOutputFormatApp.class);
            FileInputFormat.setInputPaths(job, INPUT_PATH);
            job.setMapperClass(MyMapper.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(LongWritable.class);
            job.setReducerClass(MyReducer.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(LongWritable.class);
            job.setOutputFormatClass(MySelfTextOutputFormat.class);
            job.waitForCompletion(true);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    public static class MyMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            String[] splited = line.split("\t");
            for (String word : splited) {
                context.write(new Text(word), new LongWritable(1));// emit each word with a count of 1
            }
        }
    }

    public static class MyReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context)
                throws IOException, InterruptedException {
            long count = 0L;
            for (LongWritable times : values) {
                count += times.get();
            }
            context.write(key, new LongWritable(count));
        }
    }

    /**
     * Custom output format.
     */
    public static class MySelfTextOutputFormat extends OutputFormat<Text, LongWritable> {
        FSDataOutputStream outputStream = null;

        @Override
        public RecordWriter<Text, LongWritable> getRecordWriter(TaskAttemptContext context)
                throws IOException, InterruptedException {
            try {
                FileSystem fileSystem = FileSystem.get(new URI(MySelfOutputFormatApp.OUT_PATH), context.getConfiguration());
                // the path of the output file
                String opath = MySelfOutputFormatApp.OUT_PATH + OUT_FILE_NAME;
                outputStream = fileSystem.create(new Path(opath));
            } catch (URISyntaxException e) {
                e.printStackTrace();
            }
            return new MySelfRecordWriter(outputStream);
        }

        @Override
        public void checkOutputSpecs(JobContext context) throws IOException, InterruptedException {
        }

        /**
         * OutputCommitter: creates the job's output directory and temporary output directories when the job
         * is initialized, and manages the temporary files of the job and its tasks.
         * While the job runs, many tasks are spawned, and each task writes its output into such temporary directories.
         * Once the tasks and the job have finished, the OutputCommitter deletes those temporary directories,
         * which is why no extra output is visible after the program ends.
         * All temporary files produced during the run are handled by the OutputCommitter; the real output is done
         * by the RecordWriter(..), so we only need to care about the final output.
         */
        @Override
        public OutputCommitter getOutputCommitter(TaskAttemptContext context)
                throws IOException, InterruptedException {
            // Commits the task output: initializes the output path, and cleans up when the job completes,
            // deleting the temporary directories of the job and its tasks.
            // The job's output path must be a directory.
            return new FileOutputCommitter(new Path(MySelfOutputFormatApp.OUT_PATH), context);
        }
    }

    public static class MySelfRecordWriter extends RecordWriter<Text, LongWritable> {
        FSDataOutputStream outputStream = null;

        public MySelfRecordWriter(FSDataOutputStream outputStream) {
            this.outputStream = outputStream;
        }

        @Override
        public void write(Text key, LongWritable value) throws IOException, InterruptedException {
            this.outputStream.writeBytes(key.toString());
            this.outputStream.writeBytes("\t");
            this.outputStream.writeLong(value.get());// note: writeLong emits the 8 raw bytes of the long, not its text form
        }

        @Override
        public void close(TaskAttemptContext context) throws IOException, InterruptedException {
            this.outputStream.close();
        }
    }
}

3: Writing output to multiple files

/**
 * Writes output to multiple files.
 * Uses the old API.
 */
public class MyMultipleOutputFormatApp {
    private static final String INPUT_PATH = "hdfs://hadoop1:9000/abd";// input path
    private static final String OUT_PATH = "hdfs://hadoop1:9000/out";// output path; the reduce job writes its result into this directory

    public static void main(String[] args) {
        Configuration conf = new Configuration();
        try {
            FileSystem fileSystem = FileSystem.get(new URI(OUT_PATH), conf);
            fileSystem.delete(new Path(OUT_PATH), true);
            JobConf job = new JobConf(conf, MyMultipleOutputFormatApp.class);
            job.setJarByClass(MyMultipleOutputFormatApp.class);
            FileInputFormat.setInputPaths(job, INPUT_PATH);
            job.setMapperClass(MyMapper.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(LongWritable.class);
            job.setReducerClass(MyReducer.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(LongWritable.class);
            job.setOutputFormat(MyMultipleFilesTextOutputFormat.class);
            FileOutputFormat.setOutputPath(job, new Path(OUT_PATH));
            JobClient.runJob(job);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    public static class MyMapper extends MapReduceBase implements Mapper<LongWritable, Text, Text, LongWritable> {
        @Override
        public void map(LongWritable key, Text value, OutputCollector<Text, LongWritable> output, Reporter reporter)
                throws IOException {
            String line = value.toString();
            String[] splited = line.split("\t");
            for (String word : splited) {
                output.collect(new Text(word), new LongWritable(1));
            }
        }
    }

    public static class MyReducer extends MapReduceBase implements Reducer<Text, LongWritable, Text, LongWritable> {
        @Override
        public void reduce(Text key, Iterator<LongWritable> values, OutputCollector<Text, LongWritable> output, Reporter reporter)
                throws IOException {
            long count = 0L;
            while (values.hasNext()) {
                LongWritable times = values.next();
                count += times.get();
            }
            output.collect(key, new LongWritable(count));
        }
    }

    public static class MyMultipleFilesTextOutputFormat extends MultipleOutputFormat<Text, LongWritable> {
        @Override
        protected org.apache.hadoop.mapred.RecordWriter<Text, LongWritable> getBaseRecordWriter(
                FileSystem fs, JobConf job, String name, Progressable progress) throws IOException {
            TextOutputFormat<Text, LongWritable> textOutputFormat = new TextOutputFormat<Text, LongWritable>();
            return textOutputFormat.getRecordWriter(fs, job, name, progress);
        }

        @Override
        protected String generateFileNameForKeyValue(Text key, LongWritable value, String name) {
            String keyString = key.toString();
            if (keyString.startsWith("hello")) {
                return "hello";
            } else {
                // the output file name is the value of k3
                return keyString;
            }
        }
    }
}

4: Word count with the Hadoop 1.x (old) API

/**
 * Hadoop 1.x
 * Word count written with the old API.
 */
public class WordCountApp {
    private static final String INPUT_PATH = "hdfs://hadoop1:9000/abd/hello";
    private static final String OUT_PATH = "hdfs://hadoop1:9000/out";

    public static void main(String[] args) {
        try {
            Configuration conf = new Configuration();
            FileSystem fs = FileSystem.get(conf);
            fs.delete(new Path(OUT_PATH), true);
            JobConf job = new JobConf(conf, WordCountApp.class);
            job.setJarByClass(WordCountApp.class);
            FileInputFormat.setInputPaths(job, INPUT_PATH);
            job.setMapperClass(MyMapper.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(LongWritable.class);
            job.setReducerClass(MyReducer.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(LongWritable.class);
            FileOutputFormat.setOutputPath(job, new Path(OUT_PATH));
            JobClient.runJob(job);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    public static class MyMapper extends MapReduceBase implements Mapper<LongWritable, Text, Text, LongWritable> {
        @Override
        public void map(LongWritable key, Text value, OutputCollector<Text, LongWritable> output, Reporter reporter)
                throws IOException {
            String line = value.toString();
            String[] splited = line.split("\t");
            for (String word : splited) {
                output.collect(new Text(word), new LongWritable(1L));
            }
        }
    }

    public static class MyReducer extends MapReduceBase implements Reducer<Text, LongWritable, Text, LongWritable> {
        @Override
        public void reduce(Text key, Iterator<LongWritable> values, OutputCollector<Text, LongWritable> output, Reporter reporter)
                throws IOException {
            long times = 0L;
            while (values.hasNext()) {
                LongWritable longWritable = values.next();
                times += longWritable.get();
            }
            output.collect(key, new LongWritable(times));
        }
    }
}

5: Accepting command-line arguments at run time

/**
 * Accepts command-line arguments at run time.
 * Tool interface: adds support for command-line options.
 * Run from the command line:
 *   hadoop jar jar.jar cmd.WordCountApp hdfs://hadoop1:9000/abd/hello hdfs://hadoop1:9000/out
 */
public class WordCountApp extends Configured implements Tool {
    private static String INPUT_PATH = null;// input path
    private static String OUT_PATH = null;// output path; the reduce job writes its result into this directory

    @Override
    public int run(String[] args) throws Exception {
        INPUT_PATH = args[0];
        OUT_PATH = args[1];
        Configuration conf = getConf();// configuration object
        try {
            FileSystem fileSystem = FileSystem.get(new URI(OUT_PATH), conf);
            fileSystem.delete(new Path(OUT_PATH), true);
            Job job = new Job(conf, WordCountApp.class.getSimpleName());// jobName: the job's name
            job.setJarByClass(WordCountApp.class);
            FileInputFormat.setInputPaths(job, INPUT_PATH);// data input
            job.setMapperClass(MyMapper.class);// custom map class
            job.setMapOutputKeyClass(Text.class);// map output key type
            job.setMapOutputValueClass(LongWritable.class);// map output value type
            job.setReducerClass(MyReducer.class);// custom reduce class
            job.setOutputKeyClass(Text.class);// reduce output key type
            job.setOutputValueClass(LongWritable.class);// reduce output value type
            FileOutputFormat.setOutputPath(job, new Path(OUT_PATH));// where the final reduce output goes
            job.waitForCompletion(true);// submit to the JobTracker and wait for completion
        } catch (Exception e) {
            e.printStackTrace();
        }
        return 0;
    }

    public static void main(String[] args) {
        try {
            ToolRunner.run(new Configuration(), new WordCountApp(), args);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    public static class MyMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            String[] splited = line.split("\t");
            for (String word : splited) {
                context.write(new Text(word), new LongWritable(1));// emit each word with a count of 1
            }
        }
    }

    public static class MyReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context)
                throws IOException, InterruptedException {
            long count = 0L;
            for (LongWritable times : values) {
                count += times.get();
            }
            context.write(key, new LongWritable(count));
        }
    }
}
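
This is what makes Tool/ToolRunner worth the ceremony: ToolRunner first runs the arguments through GenericOptionsParser, so the standard Hadoop options (-D, -files, -libjars, ...) are applied to the Configuration returned by getConf() before run(args) sees the remaining, application-specific arguments. For example (the reducer count of 2 is only an illustrative value):

    hadoop jar jar.jar cmd.WordCountApp -D mapred.reduce.tasks=2 hdfs://hadoop1:9000/abd/hello hdfs://hadoop1:9000/out

Inside run(), getConf() already carries mapred.reduce.tasks=2, while args[0] and args[1] are still the input and output paths.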
