Note: the map output is already shuffled (and sorted by key) before it is handed to the reduce step, so the reducer receives all pairs for a given word on consecutive input lines.
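
A tiny standalone sketch of what that contract means for the reducer's input (the sample pairs below are made up purely for illustration):

# Illustration only: Hadoop Streaming's shuffle/sort phase delivers the
# mapper's tab-separated lines sorted by key, so identical words are adjacent.
mapper_output = ['foo\t1', 'bar\t1', 'foo\t1', 'quux\t1', 'foo\t1']
reducer_input = sorted(mapper_output)  # stands in for the shuffle/sort
print(reducer_input)
# ['bar\t1', 'foo\t1', 'foo\t1', 'foo\t1', 'quux\t1']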

mapper.py

#!/usr/bin/env python
"""mapper.py"""

import sys

# input comes from STDIN (standard input)
for line in sys.stdin:
    # remove leading and trailing whitespace
    line = line.strip()
    # split the line into words
    words = line.split()
    # increase counters
    for word in words:
        # write the results to STDOUT (standard output);
        # what we output here will be the input for the
        # Reduce step, i.e. the input for reducer.py
        #
        # tab-delimited; the trivial word count is 1
        print('%s\t%s' % (word, 1))
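
To try the mapper on its own, something like echo "foo foo quux" | ./mapper.py (assuming the file has been made executable and the shebang resolves to a Python interpreter) should print one tab-separated pair per word, each with the trivial count of 1.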

reducer.py

#!/usr/bin/env python
"""reducer.py"""

import sys

current_word = None
current_count = 0
word = None

# input comes from STDIN
for line in sys.stdin:
    # remove leading and trailing whitespace
    line = line.strip()

    # parse the input we got from mapper.py
    word, count = line.split('\t', 1)

    # convert count (currently a string) to int
    try:
        count = int(count)
    except ValueError:
        # count was not a number, so silently
        # ignore/discard this line
        continue

    # this IF-switch only works because Hadoop sorts map output
    # by key (here: word) before it is passed to the reducer
    if current_word == word:
        current_count += count
    else:
        if current_word:
            # write result to STDOUT
            print('%s\t%s' % (current_word, current_count))
        current_count = count
        current_word = word

# do not forget to output the last word if needed!
if current_word == word:
    print('%s\t%s' % (current_word, current_count))
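
Before touching a cluster, the two scripts can be sanity-checked locally; sorting the mapper output takes the place of Hadoop's shuffle/sort phase. A minimal sketch (the file name local_test.py and the sample sentence are made up; it assumes mapper.py and reducer.py sit in the current directory and Python 3.5+ is used):

#!/usr/bin/env python
"""local_test.py -- emulate the map -> shuffle/sort -> reduce chain locally."""

import subprocess
import sys

sample = b"foo foo quux labs foo bar quux\n"

# Map step: feed the sample text to mapper.py over stdin.
map_out = subprocess.run([sys.executable, "mapper.py"], input=sample,
                         stdout=subprocess.PIPE, check=True).stdout

# Shuffle/sort step: sorting the mapper's lines plays the role of the shuffle.
shuffled = b"\n".join(sorted(map_out.splitlines())) + b"\n"

# Reduce step: feed the sorted pairs to reducer.py over stdin.
red_out = subprocess.run([sys.executable, "reducer.py"], input=shuffled,
                         stdout=subprocess.PIPE, check=True).stdout

sys.stdout.write(red_out.decode())
# Expected (tab-separated): bar 1, foo 3, labs 1, quux 2

On a real cluster the same two files are handed to the Hadoop Streaming jar as the mapper and reducer commands; the logic itself does not change.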

Improved Mapper and Reducer code: using Python iterators and generators

mapper.py

#!/usr/bin/env python
"""A more advanced Mapper, using Python iterators and generators."""

import sys

def read_input(file):
    for line in file:
        # split the line into words
        yield line.split()

def main(separator='\t'):
    # input comes from STDIN (standard input)
    data = read_input(sys.stdin)
    for words in data:
        # write the results to STDOUT (standard output);
        # what we output here will be the input for the
        # Reduce step, i.e. the input for reducer.py
        #
        # tab-delimited; the trivial word count is 1
        for word in words:
            print('%s%s%d' % (word, separator, 1))

if __name__ == "__main__":
    main()
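
Functionally this mapper does the same thing as the first one; the difference is that read_input is a generator, so lines are consumed lazily one at a time instead of being materialised in memory, and wrapping the work in main() makes the separator a parameter rather than a hard-coded tab.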

reducer.py

#!/usr/bin/env python
"""A more advanced Reducer, using Python iterators and generators."""

from itertools import groupby
from operator import itemgetter
import sys

def read_mapper_output(file, separator='\t'):
    for line in file:
        yield line.rstrip().split(separator, 1)

def main(separator='\t'):
    # input comes from STDIN (standard input)
    data = read_mapper_output(sys.stdin, separator=separator)
    # groupby groups multiple word-count pairs by word,
    # and creates an iterator that returns consecutive keys and their group:
    #   current_word - string containing a word (the key)
    #   group - iterator yielding all ["<current_word>", "<count>"] items
    for current_word, group in groupby(data, itemgetter(0)):
        try:
            total_count = sum(int(count) for current_word, count in group)
            print "%s%s%d" % (current_word, separator, total_count)
        except ValueError:
            # count was not a number, so silently discard this item
            pass

if __name__ == "__main__":
    main()
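
Note that groupby only merges consecutive items with equal keys, which is exactly what the sorted reducer input guarantees; together with itemgetter(0) it replaces the manual IF-switch of the basic reducer. A small standalone illustration with made-up, already-sorted pairs:

from itertools import groupby
from operator import itemgetter

# Already sorted by word, as the reducer sees its input after the shuffle/sort.
pairs = [('bar', '1'), ('foo', '1'), ('foo', '1'), ('foo', '1'),
         ('quux', '1'), ('quux', '1')]

for word, group in groupby(pairs, itemgetter(0)):
    print('%s\t%d' % (word, sum(int(count) for _, count in group)))
# bar   1
# foo   3
# quux  2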
