2011-07-08 19:33:49 -05:00
|
|
|
/**
|
|
|
|
A parallel word-frequency counting program.
|
|
|
|
|
|
|
|
This is meant primarily to demonstrate Rust's MapReduce framework.
|
|
|
|
|
|
|
|
It takes a list of files on the command line and counts how many times
each word is used. The counts are currently not printed; only the
elapsed time is logged.
|
|
|
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
use std;
|
|
|
|
|
|
|
|
import option = std::option::t;
|
|
|
|
import std::option::some;
|
|
|
|
import std::option::none;
|
|
|
|
import std::str;
|
|
|
|
import std::map;
|
2011-07-21 14:11:05 -05:00
|
|
|
import std::ivec;
|
2011-08-11 21:14:38 -05:00
|
|
|
import std::io;
|
2011-07-21 14:11:05 -05:00
|
|
|
|
|
|
|
import std::time;
|
|
|
|
import std::u64;
|
|
|
|
|
|
|
|
import std::task;
|
2011-08-13 18:03:28 -05:00
|
|
|
import std::task::task_id;
|
|
|
|
import std::comm;
|
|
|
|
import std::comm::_chan;
|
|
|
|
import std::comm::_port;
|
|
|
|
import std::comm::mk_port;
|
|
|
|
import std::comm::send;
|
2011-07-08 19:33:49 -05:00
|
|
|
|
2011-07-27 07:19:39 -05:00
|
|
|
fn map(filename: str, emit: map_reduce::putter) {
|
2011-08-11 21:14:38 -05:00
|
|
|
let f = io::file_reader(filename);
|
2011-07-12 13:13:15 -05:00
|
|
|
|
2011-07-27 07:19:39 -05:00
|
|
|
|
|
|
|
while true {
|
|
|
|
alt read_word(f) { some(w) { emit(w, 1); } none. { break; } }
|
2011-07-12 13:13:15 -05:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-07-27 07:19:39 -05:00
|
|
|
fn reduce(word: str, get: map_reduce::getter) {
|
|
|
|
let count = 0;
|
2011-07-12 13:13:15 -05:00
|
|
|
|
2011-07-27 07:19:39 -05:00
|
|
|
|
|
|
|
while true {
|
|
|
|
alt get() {
|
|
|
|
some(_) {
|
|
|
|
count += 1;
|
|
|
|
}
|
|
|
|
none. { break }
|
2011-07-12 13:13:15 -05:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-07-08 19:33:49 -05:00
|
|
|
mod map_reduce {
|
|
|
|
export putter;
|
|
|
|
export getter;
|
|
|
|
export mapper;
|
|
|
|
export reducer;
|
|
|
|
export map_reduce;
|
|
|
|
|
2011-07-27 07:19:39 -05:00
|
|
|
type putter = fn(str, int) ;
|
2011-07-08 19:33:49 -05:00
|
|
|
|
2011-07-27 07:19:39 -05:00
|
|
|
type mapper = fn(str, putter) ;
|
2011-07-08 19:33:49 -05:00
|
|
|
|
2011-07-27 07:19:39 -05:00
|
|
|
type getter = fn() -> option[int] ;
|
2011-07-13 17:44:09 -05:00
|
|
|
|
2011-07-27 07:19:39 -05:00
|
|
|
type reducer = fn(str, getter) ;
|
2011-07-08 19:33:49 -05:00
|
|
|
|
2011-07-12 13:13:15 -05:00
|
|
|
tag ctrl_proto {
|
2011-08-13 18:03:28 -05:00
|
|
|
find_reducer([u8], _chan[_chan[reduce_proto]]);
|
2011-07-12 13:13:15 -05:00
|
|
|
mapper_done;
|
|
|
|
}
|
|
|
|
|
2011-07-27 07:19:39 -05:00
|
|
|
tag reduce_proto { emit_val(int); done; ref; release; }
|
2011-07-12 13:13:15 -05:00
|
|
|
|
2011-08-13 18:03:28 -05:00
|
|
|
fn start_mappers(ctrl: _chan[ctrl_proto], inputs: &[str]) -> [task_id] {
|
2011-08-15 11:52:18 -05:00
|
|
|
let tasks = ~[];
|
2011-07-27 07:19:39 -05:00
|
|
|
for i: str in inputs {
|
2011-08-13 18:03:28 -05:00
|
|
|
tasks += ~[task::_spawn(bind map_task(ctrl, i))];
|
2011-07-12 13:13:15 -05:00
|
|
|
}
|
2011-07-21 14:11:05 -05:00
|
|
|
ret tasks;
|
2011-07-12 13:13:15 -05:00
|
|
|
}
|
|
|
|
|
2011-08-13 18:03:28 -05:00
|
|
|
fn map_task(ctrl: _chan[ctrl_proto], input: str) {
|
2011-07-21 14:11:05 -05:00
|
|
|
// log_err "map_task " + input;
|
2011-07-27 07:19:39 -05:00
|
|
|
let intermediates = map::new_str_hash();
|
2011-07-08 19:33:49 -05:00
|
|
|
|
2011-08-13 18:03:28 -05:00
|
|
|
fn emit(im: &map::hashmap[str, _chan[reduce_proto]],
|
|
|
|
ctrl: _chan[ctrl_proto], key: str, val: int) {
|
2011-07-27 07:19:39 -05:00
|
|
|
let c;
|
|
|
|
alt im.find(key) {
|
|
|
|
some(_c) {
|
|
|
|
|
|
|
|
c = _c
|
|
|
|
}
|
|
|
|
none. {
|
2011-08-13 18:03:28 -05:00
|
|
|
let p = mk_port[_chan[reduce_proto]]();
|
2011-08-11 18:36:20 -05:00
|
|
|
let keyi = str::bytes(key);
|
2011-08-13 18:03:28 -05:00
|
|
|
send(ctrl, find_reducer(keyi, p.mk_chan()));
|
|
|
|
c = p.recv();
|
|
|
|
im.insert(key, c);
|
|
|
|
send(c, ref);
|
2011-07-27 07:19:39 -05:00
|
|
|
}
|
2011-07-08 19:33:49 -05:00
|
|
|
}
|
2011-08-13 18:03:28 -05:00
|
|
|
send(c, emit_val(val));
|
2011-07-08 19:33:49 -05:00
|
|
|
}
|
|
|
|
|
2011-07-12 13:13:15 -05:00
|
|
|
map(input, bind emit(intermediates, ctrl, _, _));
|
2011-07-21 14:11:05 -05:00
|
|
|
|
2011-08-13 18:03:28 -05:00
|
|
|
for each kv: @{key: str, val: _chan[reduce_proto]} in
|
2011-07-27 07:19:39 -05:00
|
|
|
intermediates.items() {
|
2011-08-13 18:03:28 -05:00
|
|
|
send(kv.val, release);
|
2011-07-21 14:11:05 -05:00
|
|
|
}
|
|
|
|
|
2011-08-13 18:03:28 -05:00
|
|
|
send(ctrl, mapper_done);
|
2011-07-12 13:13:15 -05:00
|
|
|
}
|
2011-07-08 19:33:49 -05:00
|
|
|
|
2011-08-13 18:03:28 -05:00
|
|
|
fn reduce_task(key: str, out: _chan[_chan[reduce_proto]]) {
|
|
|
|
let p = mk_port();
|
2011-07-12 13:13:15 -05:00
|
|
|
|
2011-08-13 18:03:28 -05:00
|
|
|
send(out, p.mk_chan());
|
2011-07-12 13:13:15 -05:00
|
|
|
|
2011-07-27 07:19:39 -05:00
|
|
|
let ref_count = 0;
|
|
|
|
let is_done = false;
|
2011-07-12 13:13:15 -05:00
|
|
|
|
2011-08-13 18:03:28 -05:00
|
|
|
fn get(p: &_port[reduce_proto], ref_count: &mutable int,
|
2011-07-27 07:19:39 -05:00
|
|
|
is_done: &mutable bool) -> option[int] {
|
|
|
|
while !is_done || ref_count > 0 {
|
2011-08-13 18:03:28 -05:00
|
|
|
alt p.recv() {
|
2011-07-27 07:19:39 -05:00
|
|
|
emit_val(v) {
|
|
|
|
// log_err #fmt("received %d", v);
|
|
|
|
ret some(v);
|
|
|
|
}
|
|
|
|
done. {
|
|
|
|
// log_err "all done";
|
|
|
|
is_done = true;
|
|
|
|
}
|
|
|
|
ref. { ref_count += 1; }
|
|
|
|
release. { ref_count -= 1; }
|
2011-07-21 14:11:05 -05:00
|
|
|
}
|
2011-07-08 19:33:49 -05:00
|
|
|
}
|
2011-07-21 14:11:05 -05:00
|
|
|
ret none;
|
2011-07-08 19:33:49 -05:00
|
|
|
}
|
|
|
|
|
2011-07-21 14:11:05 -05:00
|
|
|
reduce(key, bind get(p, ref_count, is_done));
|
2011-07-08 19:33:49 -05:00
|
|
|
}
|
|
|
|
|
2011-08-11 23:37:27 -05:00
|
|
|
fn map_reduce(inputs: &[str]) {
|
2011-08-13 18:03:28 -05:00
|
|
|
let ctrl = mk_port[ctrl_proto]();
|
2011-07-08 19:33:49 -05:00
|
|
|
|
2011-08-13 18:03:28 -05:00
|
|
|
// This task becomes the master control task. It task::_spawns
|
2011-07-12 13:13:15 -05:00
|
|
|
// to do the rest.
|
2011-07-08 19:33:49 -05:00
|
|
|
|
2011-08-13 18:03:28 -05:00
|
|
|
let reducers: map::hashmap[str, _chan[reduce_proto]];
|
2011-07-08 19:33:49 -05:00
|
|
|
|
2011-07-12 13:13:15 -05:00
|
|
|
reducers = map::new_str_hash();
|
|
|
|
|
2011-08-13 18:03:28 -05:00
|
|
|
let tasks = start_mappers(ctrl.mk_chan(), inputs);
|
2011-07-12 13:13:15 -05:00
|
|
|
|
2011-08-11 23:37:27 -05:00
|
|
|
let num_mappers = ivec::len(inputs) as int;
|
2011-07-12 13:13:15 -05:00
|
|
|
|
2011-07-27 07:19:39 -05:00
|
|
|
while num_mappers > 0 {
|
2011-08-13 18:03:28 -05:00
|
|
|
alt ctrl.recv() {
|
2011-07-27 07:19:39 -05:00
|
|
|
mapper_done. {
|
|
|
|
// log_err "received mapper terminated.";
|
|
|
|
num_mappers -= 1;
|
|
|
|
}
|
|
|
|
find_reducer(ki, cc) {
|
|
|
|
let c;
|
2011-08-11 19:13:53 -05:00
|
|
|
let k = str::unsafe_from_bytes(ki);
|
2011-07-27 07:19:39 -05:00
|
|
|
// log_err "finding reducer for " + k;
|
|
|
|
alt reducers.find(k) {
|
|
|
|
some(_c) {
|
|
|
|
// log_err "reusing existing reducer for " + k;
|
|
|
|
c = _c;
|
|
|
|
}
|
|
|
|
none. {
|
|
|
|
// log_err "creating new reducer for " + k;
|
2011-08-13 18:03:28 -05:00
|
|
|
let p = mk_port();
|
2011-08-15 11:52:18 -05:00
|
|
|
tasks +=
|
|
|
|
~[task::_spawn(bind reduce_task(k, p.mk_chan()))];
|
2011-08-13 18:03:28 -05:00
|
|
|
c = p.recv();
|
2011-07-27 07:19:39 -05:00
|
|
|
reducers.insert(k, c);
|
|
|
|
}
|
2011-07-08 19:33:49 -05:00
|
|
|
}
|
2011-08-13 18:03:28 -05:00
|
|
|
send(cc, c);
|
2011-07-27 07:19:39 -05:00
|
|
|
}
|
2011-07-08 19:33:49 -05:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-08-13 18:03:28 -05:00
|
|
|
for each kv: @{key: str, val: _chan[reduce_proto]} in reducers.items()
|
2011-07-27 07:19:39 -05:00
|
|
|
{
|
2011-08-13 18:03:28 -05:00
|
|
|
send(kv.val, done);
|
2011-07-08 19:33:49 -05:00
|
|
|
}
|
2011-07-21 14:11:05 -05:00
|
|
|
|
2011-08-13 18:03:28 -05:00
|
|
|
for t in tasks { task::join_id(t); }
|
2011-07-12 13:13:15 -05:00
|
|
|
}
|
|
|
|
}
|
2011-07-08 19:33:49 -05:00
|
|
|
|
2011-07-27 07:19:39 -05:00
|
|
|
fn main(argv: vec[str]) {
|
2011-08-11 23:37:27 -05:00
|
|
|
let iargv = ivec::from_vec(argv);
|
|
|
|
if ivec::len(iargv) < 2u {
|
2011-08-11 21:14:38 -05:00
|
|
|
let out = io::stdout();
|
2011-07-12 13:13:15 -05:00
|
|
|
|
2011-08-11 23:37:27 -05:00
|
|
|
out.write_line(#fmt("Usage: %s <filename> ...", iargv.(0)));
|
2011-07-25 17:02:43 -05:00
|
|
|
|
|
|
|
// TODO: run something just to make sure the code hasn't
|
|
|
|
// broken yet. This is the unit test mode of this program.
|
|
|
|
|
|
|
|
ret;
|
2011-07-08 19:33:49 -05:00
|
|
|
}
|
|
|
|
|
2011-07-25 17:02:43 -05:00
|
|
|
// We can get by with 8k stacks, and we'll probably exhaust our
|
|
|
|
// address space otherwise.
|
|
|
|
task::set_min_stack(8192u);
|
|
|
|
|
2011-07-27 07:19:39 -05:00
|
|
|
let start = time::precise_time_ns();
|
2011-07-25 17:02:43 -05:00
|
|
|
|
2011-08-11 23:37:27 -05:00
|
|
|
map_reduce::map_reduce(ivec::slice(iargv, 1u, ivec::len(iargv)));
|
2011-07-27 07:19:39 -05:00
|
|
|
let stop = time::precise_time_ns();
|
2011-07-21 14:11:05 -05:00
|
|
|
|
2011-07-27 07:19:39 -05:00
|
|
|
let elapsed = stop - start;
|
2011-07-21 14:11:05 -05:00
|
|
|
elapsed /= 1000000u64;
|
|
|
|
|
|
|
|
log_err "MapReduce completed in " + u64::str(elapsed) + "ms";
|
2011-07-08 19:33:49 -05:00
|
|
|
}
|
|
|
|
|
2011-08-11 21:14:38 -05:00
|
|
|
// Reads the next word from `r`.
//
// A word is a maximal run of characters accepted by `is_word_char`; any
// other characters before it are skipped. Returns `some(word)` when a word
// was read and `none` when the input is exhausted without finding one.
fn read_word(r: io::reader) -> option[str] {
    let w = "";

    while !r.eof() {
        let c = r.read_char();

        if is_word_char(c) {
            w += str::from_char(c);
        } else { if w != "" { ret some(w); } }
    }

    // The reader may report end of input right after the last word
    // character, before any delimiter is seen; do not lose that word.
    if w != "" { ret some(w); }

    ret none;
}
|
|
|
|
|
2011-07-27 07:19:39 -05:00
|
|
|
// Returns true when `c` is one of the ASCII decimal digits '0' through '9'.
fn is_digit(c: char) -> bool { '0' <= c && c <= '9' }
|
|
|
|
|
2011-07-27 07:19:39 -05:00
|
|
|
// Returns true when `c` is an ASCII lowercase letter, 'a' through 'z'.
fn is_alpha_lower(c: char) -> bool { 'a' <= c && c <= 'z' }
|
|
|
|
|
2011-07-27 07:19:39 -05:00
|
|
|
// Returns true when `c` is an ASCII uppercase letter, 'A' through 'Z'.
fn is_alpha_upper(c: char) -> bool { 'A' <= c && c <= 'Z' }
|
|
|
|
|
2011-07-27 07:19:39 -05:00
|
|
|
// Returns true when `c` is accepted by either `is_alpha_upper` or
// `is_alpha_lower`.
fn is_alpha(c: char) -> bool {
    if is_alpha_lower(c) { true } else { is_alpha_upper(c) }
}
|
2011-07-08 19:33:49 -05:00
|
|
|
|
2011-07-25 17:02:43 -05:00
|
|
|
// Returns true when `c` may appear inside a word: a letter (`is_alpha`),
// a digit (`is_digit`), or an underscore.
fn is_word_char(c: char) -> bool {
    c == '_' || is_digit(c) || is_alpha(c)
}
|