#!/usr/bin/env python
# map.py: the streaming mapper; reads raw text from stdin.
import sys

def read_inputs(file):
    # Yield each input line as a list of whitespace-separated words.
    for line in file:
        line = line.strip()
        yield line.split()

def main():
    # Emit one "<word>\t1" pair per word; Hadoop streaming treats the
    # text before the first tab as the key.
    for words in read_inputs(sys.stdin):
        for word in words:
            print("{}\t{}".format(word, 1))

if __name__ == "__main__":
    main()
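Before wiring this into Hadoop, the mapper's fan-out is easy to check in isolation. A minimal sketch of the same logic (the sample lines are arbitrary, standing in for sys.stdin):

sample = ["test a b c\n", "def a h\n"]
for line in sample:
    for word in line.split():
        # One tab-separated pair per word, count always 1.
        print("{}\t{}".format(word, 1))
# Emits, in order: test 1, a 1, b 1, c 1, def 1, a 1, h 1 (tab-separated).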
#!/usr/bin/env python
# reduce.py: the streaming reducer; its input arrives sorted by key.
import sys

def read_map_outputs(file):
    # Split each "word\tcount" line into a (word, count) pair.
    for line in file:
        yield line.strip().split("\t", 1)

def main():
    current_word = None
    word_count = 0
    for word, count in read_map_outputs(sys.stdin):
        try:
            count = int(count)
        except ValueError:
            # Skip malformed counts rather than crashing the task.
            continue
        if current_word == word:
            word_count += count
        else:
            # The sort phase groups equal keys together, so seeing a new
            # word means the previous word's total is final.
            if current_word is not None:
                print("{}\t{}".format(current_word, word_count))
            current_word = word
            word_count = count
    # Flush the last word.
    if current_word is not None:
        print("{}\t{}".format(current_word, word_count))

if __name__ == "__main__":
    main()
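Because the shuffle phase delivers keys in sorted order, runs of identical words are always adjacent, which is exactly the contract itertools.groupby expects. An equivalent reducer can be sketched as below; this is an illustrative alternative, not the script above, and it assumes every input line is a well-formed "word\tcount" pair:

import sys
from itertools import groupby

def pairs(stream):
    # Yield (word, count) from tab-separated "word\tcount" lines.
    for line in stream:
        word, count = line.rstrip("\n").split("\t", 1)
        yield word, int(count)

# groupby only merges adjacent equal keys, which matches the sorted stream.
for word, group in groupby(pairs(sys.stdin), key=lambda kv: kv[0]):
    print("{}\t{}".format(word, sum(count for _, count in group)))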
echo "test a b c abc \ndef a h" | python python/map.py | sort -k1| python python/reduce.py # streaming 提交运行 job
# submit and run the streaming job; -file ships the local scripts to the task nodes,
# so the mapper/reducer commands reference them by basename
bin/hadoop jar share/hadoop/tools/lib/hadoop-streaming-2.8.0.jar \
    -input wd_data -output wd_data_output \
    -mapper "python map.py" -reducer "python reduce.py" \
    -file python/wordcount/map.py -file python/wordcount/reduce.py
# copy the job output from HDFS back to the local filesystem
hadoop fs -get /user/username/wd_data_output python/wordcount