pom文件

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <!--
        Build definition for the word-count MapReduce example.
        Running "mvn package" produces a single executable jar (dependencies
        included) whose manifest points at the job's main class.
    -->
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.zuoyan</groupId>
    <artifactId>hadoop</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <packaging>jar</packaging>

    <name>hadoop</name>
    <url>http://maven.apache.org</url>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>

    <dependencies>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>3.8.1</version>
            <!-- Test-only: keeps JUnit out of the jar-with-dependencies artifact. -->
            <scope>test</scope>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-client -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>3.0.0</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/com.janeluo/ikanalyzer -->
        <dependency>
            <groupId>com.janeluo</groupId>
            <artifactId>ikanalyzer</artifactId>
            <version>2012_u6</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <!-- Version pinned so the build does not depend on the Maven default. -->
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-assembly-plugin</artifactId>
                <version>3.1.0</version>
                <configuration>
                    <!-- Name the fat jar like the normal artifact (no "-jar-with-dependencies" suffix). -->
                    <appendAssemblyId>false</appendAssemblyId>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                    <archive>
                        <manifest>
                            <!-- Class whose main method is the entry point of the jar. -->
                            <mainClass>com.zuoyan.hadoop.FirstMapReduceJob</mainClass>
                            <!-- <mainClass>com.geotmt.hadoop.hdfs.FirstMapReduceJob</mainClass> -->
                        </manifest>
                    </archive>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <!-- "single" replaces the "assembly" goal, which was removed in plugin 3.x. -->
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.6.2</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                    <encoding>UTF-8</encoding>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>

  

单词计数-实现

package com.zuoyan.hadoop;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme; /**
* 单词计数
*
*/
public class FirstMapReduceJob { public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable>{ private final static IntWritable one = new IntWritable(1);
private Text word = new Text(); public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
/*
* 默认英文分词
*
StringTokenizer itr = new StringTokenizer(value.toString());
while (itr.hasMoreTokens()) {
word.set(itr.nextToken());
context.write(word, one);
}
*/
/*
* 中文分词-使用IK分词器分词
*/
byte[] bytes = value.getBytes();
InputStream inputStream = new ByteArrayInputStream(bytes);
Reader reader = new InputStreamReader(inputStream);
IKSegmenter iKSegmenter = new IKSegmenter(reader,true);
Lexeme t;
while((t=iKSegmenter.next()) != null){
context.write(new Text(t.getLexemeText()), new IntWritable(1));
} //方案二,获取文件信息
// context.getInputSplit().getLocationInfo(); }
} public static class IntSumReducer extends Reducer<Text,IntWritable,Text,IntWritable> {
private IntWritable result = new IntWritable(); public void reduce(Text key, Iterable<IntWritable> values,Context context ) throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : values) {
sum += val.get();
}
result.set(sum);
context.write(key, result);
}
} public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
if (otherArgs.length != 2) {
System.err.println("Usage: wordcount <in> <out>");
System.exit(2);
}
Job job = new Job(conf, "word count");
job.setJarByClass(FirstMapReduceJob.class);
job.setMapperClass(TokenizerMapper.class);
job.setCombinerClass(IntSumReducer.class);
job.setReducerClass(IntSumReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}

  

最新文章

  1. 2016huasacm暑假集训训练五 E - What Is Your Grade?
  2. 大数据 > 数据平台方案评估
  3. cacti监控juniper路由器
  4. php工作笔记5-css定位
  5. Designing a CSS based template
  6. UnWind Segue
  7. 【转】maven命令背后是如何工作的
  8. face mask in opencv
  9. hql抓取要注意的点
  10. centos 6.0中文输入法的设置
  11. Spring-----代码中使用注入的Properties配置属性
  12. json格式 (JavaScript Object Notation)
  13. HTML5离线存储的工作原理和使用
  14. update_engine-DownloadAction(一)
  15. Centos7.5基于MySQL5.7的 InnoDB Cluster 多节点高可用集群环境部署记录
  16. PAT 乙级 1026 程序运行时间(15) C++版
  17. ORA-01461的解决过程~~
  18. 常用的Array相关的属性和方法
  19. IBM研究院找到度量安全性方法:容器与虚拟机,谁更安全?
  20. 『cs231n』作业3问题4选讲_图像梯度应用强化

热门文章

  1. Drone - 安装,搭配 GitLab 下的配置和使用
  2. FCKEditor添加字体
  3. poj2010 Moo University - Financial Aid 优先队列
  4. window.location.href后携带参数
  5. Shell生成随机密码
  6. 协程分析之context上下文切换
  7. mysql练习题目试水50题,附建库sql代码
  8. Struts2之校验
  9. 国内常用Linux镜像站点
  10. python——datetime模块