单词计数-MapReduceJob

pom文件

<!--
  Maven build for the word-count MapReduce job.
  Produces a single self-contained jar (jar-with-dependencies) during the
  package phase, with the job's main class recorded in the manifest.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
	<modelVersion>4.0.0</modelVersion>

	<groupId>com.zuoyan</groupId>
	<artifactId>hadoop</artifactId>
	<version>0.0.1-SNAPSHOT</version>
	<packaging>jar</packaging>

	<name>hadoop</name>
	<url>http://maven.apache.org</url>

	<properties>
		<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
	</properties>

	<dependencies>
		<dependency>
			<groupId>junit</groupId>
			<artifactId>junit</artifactId>
			<version>3.8.1</version>
			<!-- Test-only: keeps JUnit off the compile classpath and out of the assembled jar. -->
			<scope>test</scope>
		</dependency>

		<!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-client -->
		<dependency>
			<groupId>org.apache.hadoop</groupId>
			<artifactId>hadoop-client</artifactId>
			<version>3.0.0</version>
		</dependency>

		<!-- https://mvnrepository.com/artifact/com.janeluo/ikanalyzer -->
		<dependency>
			<groupId>com.janeluo</groupId>
			<artifactId>ikanalyzer</artifactId>
			<version>2012_u6</version>
		</dependency>
	</dependencies>

	<build>
		<plugins>
			<plugin>
				<artifactId>maven-assembly-plugin</artifactId>
				<configuration>
					<!-- Name the assembled jar like the main artifact (no "-jar-with-dependencies" suffix). -->
					<appendAssemblyId>false</appendAssemblyId>
					<descriptorRefs>
						<descriptorRef>jar-with-dependencies</descriptorRef>
					</descriptorRefs>
					<archive>
						<manifest>
							<!-- Class whose main method is the entry point of the jar -->
							<mainClass>com.zuoyan.hadoop.FirstMapReduceJob</mainClass>
<!-- 							<mainClass>com.geotmt.hadoop.hdfs.FirstMapReduceJob</mainClass> -->
						</manifest>
					</archive>
				</configuration>
				<executions>
					<execution>
						<id>make-assembly</id>
						<phase>package</phase>
						<goals>
							<!--
							  "single" is the goal intended for lifecycle binding. The old
							  "assembly" goal forks its own package lifecycle, is deprecated,
							  and no longer exists in maven-assembly-plugin 3.x.
							-->
							<goal>single</goal>
						</goals>
					</execution>
				</executions>
			</plugin>

			<plugin>
				<groupId>org.apache.maven.plugins</groupId>
				<artifactId>maven-compiler-plugin</artifactId>
				<version>3.6.2</version>
				<configuration>
					<source>1.8</source>
					<target>1.8</target>
					<encoding>UTF-8</encoding>
				</configuration>
			</plugin>
		</plugins>
	</build>
</project>

单词计数-实现

package com.zuoyan.hadoop;

import java.io.ByteArrayInputStream;

import java.io.IOException;

import java.io.InputStream;

import java.io.InputStreamReader;

import java.io.Reader;

import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.IntWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Job;

import org.apache.hadoop.mapreduce.Mapper;

import org.apache.hadoop.mapreduce.Reducer;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import org.apache.hadoop.util.GenericOptionsParser;

import org.wltea.analyzer.core.IKSegmenter;

import org.wltea.analyzer.core.Lexeme;

/**

 * 单词计数

 *

 */

public class FirstMapReduceJob {

    public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable>{

        private final static IntWritable one = new IntWritable(1);

        private Text word = new Text();

        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {

            /*

             * 默认英文分词

             *

            StringTokenizer itr = new StringTokenizer(value.toString());

            while (itr.hasMoreTokens()) {

                word.set(itr.nextToken());

                context.write(word, one);

            }

            */

        	/*

        	 * 中文分词-使用IK分词器分词

        	 */

            byte[] bytes = value.getBytes();

            InputStream inputStream = new ByteArrayInputStream(bytes);

            Reader reader = new InputStreamReader(inputStream);

            IKSegmenter iKSegmenter = new IKSegmenter(reader,true);

            Lexeme t;

            while((t=iKSegmenter.next()) != null){

            	context.write(new Text(t.getLexemeText()), new IntWritable(1));

            }

            //方案二，获取文件信息

//            context.getInputSplit().getLocationInfo();

        }

    }

    public static class IntSumReducer extends Reducer<Text,IntWritable,Text,IntWritable> {

        private IntWritable result = new IntWritable();

        public void reduce(Text key, Iterable<IntWritable> values,Context context ) throws IOException, InterruptedException {

            int sum = 0;

            for (IntWritable val : values) {

                sum += val.get();

            }

            result.set(sum);

            context.write(key, result);

        }

    }

    public static void main(String[] args) throws Exception {

        Configuration conf = new Configuration();

        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

        if (otherArgs.length != 2) {

            System.err.println("Usage: wordcount <in> <out>");

            System.exit(2);

        }

        Job job = new Job(conf, "word count");

        job.setJarByClass(FirstMapReduceJob.class);

        job.setMapperClass(TokenizerMapper.class);

        job.setCombinerClass(IntSumReducer.class);

        job.setReducerClass(IntSumReducer.class);

        job.setOutputKeyClass(Text.class);

        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));

        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);

    }

}

巴特西

单词计数-MapReduceJob

最新文章

热门文章