2011-10-13 17:38:19 -05:00
|
|
|
// xfail-test - #1038 - Can't do this safely with bare functions
|
|
|
|
|
2011-08-25 17:36:55 -05:00
|
|
|
/**
|
|
|
|
A parallel word-frequency counting program.
|
|
|
|
|
|
|
|
This is meant primarily to demonstrate Rust's MapReduce framework.
|
|
|
|
|
|
|
|
It takes a list of files on the command line and outputs a list of
|
|
|
|
words along with how many times each word is used.
|
|
|
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
use std;
|
|
|
|
|
2012-01-31 19:05:20 -06:00
|
|
|
import option = option;
|
2011-12-13 18:25:51 -06:00
|
|
|
import option::some;
|
|
|
|
import option::none;
|
|
|
|
import str;
|
2011-08-25 17:36:55 -05:00
|
|
|
import std::treemap;
|
2011-12-13 18:25:51 -06:00
|
|
|
import vec;
|
2011-08-25 17:36:55 -05:00
|
|
|
import std::io;
|
|
|
|
|
|
|
|
import std::time;
|
2011-12-13 18:25:51 -06:00
|
|
|
import u64;
|
|
|
|
|
|
|
|
import task;
|
|
|
|
import task::joinable_task;
|
|
|
|
import comm;
|
|
|
|
import comm::chan;
|
|
|
|
import comm::port;
|
|
|
|
import comm::recv;
|
|
|
|
import comm::send;
|
2011-08-25 17:36:55 -05:00
|
|
|
|
2011-10-10 06:54:03 -05:00
|
|
|
fn map(&&filename: [u8], emit: map_reduce::putter<[u8], int>) {
|
2012-01-25 02:53:17 -06:00
|
|
|
let f = io::file_reader(str::from_bytes(filename));
|
2011-08-25 17:36:55 -05:00
|
|
|
|
2012-03-09 18:11:56 -06:00
|
|
|
loop {
|
2011-08-25 17:36:55 -05:00
|
|
|
alt read_word(f) {
|
2011-09-01 19:27:58 -05:00
|
|
|
some(w) { emit(str::bytes(w), 1); }
|
2012-01-19 00:37:22 -06:00
|
|
|
none { break; }
|
2011-08-25 17:36:55 -05:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-10-10 06:54:03 -05:00
|
|
|
fn reduce(&&_word: [u8], get: map_reduce::getter<int>) {
|
2011-08-25 17:36:55 -05:00
|
|
|
let count = 0;
|
|
|
|
|
2012-03-09 18:11:56 -06:00
|
|
|
loop { alt get() { some(_) { count += 1; } none { break; } } }
|
2011-08-25 17:36:55 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
mod map_reduce {
|
|
|
|
export putter;
|
|
|
|
export getter;
|
|
|
|
export mapper;
|
|
|
|
export reducer;
|
|
|
|
export map_reduce;
|
|
|
|
|
2012-01-05 08:35:37 -06:00
|
|
|
type putter<K: send, V: send> = fn(K, V);
|
2011-08-25 17:36:55 -05:00
|
|
|
|
|
|
|
// FIXME: the first K1 parameter should probably be a -, but that
|
|
|
|
// doesn't parse at the moment.
|
2012-01-05 08:35:37 -06:00
|
|
|
type mapper<K1: send, K2: send, V: send> = fn(K1, putter<K2, V>);
|
2011-08-25 17:36:55 -05:00
|
|
|
|
2012-01-05 08:35:37 -06:00
|
|
|
type getter<V: send> = fn() -> option<V>;
|
2011-08-25 17:36:55 -05:00
|
|
|
|
2012-01-05 08:35:37 -06:00
|
|
|
type reducer<K: send, V: send> = fn(K, getter<V>);
|
2011-08-25 17:36:55 -05:00
|
|
|
|
2012-01-19 18:05:33 -06:00
|
|
|
enum ctrl_proto<K: send, V: send> {
|
2011-08-25 17:36:55 -05:00
|
|
|
find_reducer(K, chan<chan<reduce_proto<V>>>);
|
|
|
|
mapper_done;
|
|
|
|
}
|
|
|
|
|
2012-01-19 18:05:33 -06:00
|
|
|
enum reduce_proto<V: send> { emit_val(V); done; ref; release; }
|
2011-08-25 17:36:55 -05:00
|
|
|
|
2012-01-05 08:35:37 -06:00
|
|
|
fn start_mappers<K1: send, K2: send,
|
|
|
|
V: send>(map: mapper<K1, K2, V>,
|
2011-09-12 04:27:30 -05:00
|
|
|
ctrl: chan<ctrl_proto<K2, V>>, inputs: [K1]) ->
|
2011-09-02 17:34:58 -05:00
|
|
|
[joinable_task] {
|
2011-08-25 17:36:55 -05:00
|
|
|
let tasks = [];
|
2012-04-06 13:01:43 -05:00
|
|
|
for inputs.each {|i|
|
2011-08-25 17:36:55 -05:00
|
|
|
let m = map, c = ctrl, ii = i;
|
2012-01-06 22:55:56 -06:00
|
|
|
tasks += [task::spawn_joinable {|| map_task(m, c, ii)}];
|
2011-08-25 17:36:55 -05:00
|
|
|
}
|
|
|
|
ret tasks;
|
|
|
|
}
|
|
|
|
|
2012-01-05 08:35:37 -06:00
|
|
|
fn map_task<K: send1, K: send2,
|
|
|
|
V: send>(-map: mapper<K1, K2, V>,
|
2011-10-25 08:56:55 -05:00
|
|
|
-ctrl: chan<ctrl_proto<K2, V>>,
|
2011-09-12 05:39:38 -05:00
|
|
|
-input: K1) {
|
2011-12-22 19:53:53 -06:00
|
|
|
// log(error, "map_task " + input);
|
2011-08-25 17:36:55 -05:00
|
|
|
let intermediates = treemap::init();
|
|
|
|
|
2012-01-05 08:35:37 -06:00
|
|
|
fn emit<K: send2,
|
|
|
|
V: send>(im: treemap::treemap<K2, chan<reduce_proto<V>>>,
|
2011-09-12 04:27:30 -05:00
|
|
|
ctrl: chan<ctrl_proto<K2, V>>, key: K2, val: V) {
|
2011-08-25 17:36:55 -05:00
|
|
|
let c;
|
|
|
|
alt treemap::find(im, key) {
|
2011-09-15 02:48:39 -05:00
|
|
|
some(_c) { c = _c; }
|
2012-01-19 00:37:22 -06:00
|
|
|
none {
|
2011-08-25 17:36:55 -05:00
|
|
|
let p = port();
|
|
|
|
send(ctrl, find_reducer(key, chan(p)));
|
|
|
|
c = recv(p);
|
|
|
|
treemap::insert(im, key, c);
|
|
|
|
send(c, ref);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
send(c, emit_val(val));
|
|
|
|
}
|
|
|
|
|
|
|
|
map(input, bind emit(intermediates, ctrl, _, _));
|
|
|
|
|
2012-01-05 08:35:37 -06:00
|
|
|
fn finish<K: send, V: send>(_k: K, v: chan<reduce_proto<V>>) {
|
2011-08-25 17:36:55 -05:00
|
|
|
send(v, release);
|
|
|
|
}
|
|
|
|
treemap::traverse(intermediates, finish);
|
|
|
|
send(ctrl, mapper_done);
|
|
|
|
}
|
|
|
|
|
2012-01-05 08:35:37 -06:00
|
|
|
fn reduce_task<K: send,
|
|
|
|
V: send>(-reduce: reducer<K, V>, -key: K,
|
2011-09-12 05:39:38 -05:00
|
|
|
-out: chan<chan<reduce_proto<V>>>) {
|
2011-08-25 17:36:55 -05:00
|
|
|
let p = port();
|
|
|
|
|
|
|
|
send(out, chan(p));
|
|
|
|
|
|
|
|
let ref_count = 0;
|
|
|
|
let is_done = false;
|
|
|
|
|
2012-01-05 08:35:37 -06:00
|
|
|
fn get<V: send>(p: port<reduce_proto<V>>,
|
2011-10-25 08:56:55 -05:00
|
|
|
&ref_count: int, &is_done: bool)
|
2011-09-12 05:39:38 -05:00
|
|
|
-> option<V> {
|
2011-08-25 17:36:55 -05:00
|
|
|
while !is_done || ref_count > 0 {
|
|
|
|
alt recv(p) {
|
|
|
|
emit_val(v) {
|
2011-12-22 16:42:52 -06:00
|
|
|
// #error("received %d", v);
|
2011-08-25 17:36:55 -05:00
|
|
|
ret some(v);
|
|
|
|
}
|
2012-01-19 00:37:22 -06:00
|
|
|
done {
|
2011-12-22 16:42:52 -06:00
|
|
|
// #error("all done");
|
2011-08-25 17:36:55 -05:00
|
|
|
is_done = true;
|
|
|
|
}
|
|
|
|
ref. { ref_count += 1; }
|
|
|
|
release. { ref_count -= 1; }
|
|
|
|
}
|
|
|
|
}
|
|
|
|
ret none;
|
|
|
|
}
|
|
|
|
|
|
|
|
reduce(key, bind get(p, ref_count, is_done));
|
|
|
|
}
|
|
|
|
|
2012-01-05 08:35:37 -06:00
|
|
|
fn map_reduce<K: send1, K: send2,
|
|
|
|
V: send>(map: mapper<K1, K2, V>, reduce: reducer<K2, V>,
|
2011-09-12 04:27:30 -05:00
|
|
|
inputs: [K1]) {
|
2011-08-25 17:36:55 -05:00
|
|
|
let ctrl = port();
|
|
|
|
|
|
|
|
// This task becomes the master control task. It task::_spawns
|
|
|
|
// to do the rest.
|
|
|
|
|
|
|
|
let reducers = treemap::init();
|
|
|
|
|
|
|
|
let tasks = start_mappers(map, chan(ctrl), inputs);
|
|
|
|
|
|
|
|
let num_mappers = vec::len(inputs) as int;
|
|
|
|
|
|
|
|
while num_mappers > 0 {
|
|
|
|
alt recv(ctrl) {
|
2012-01-19 00:37:22 -06:00
|
|
|
mapper_done {
|
2011-12-22 16:42:52 -06:00
|
|
|
// #error("received mapper terminated.");
|
2011-08-25 17:36:55 -05:00
|
|
|
num_mappers -= 1;
|
|
|
|
}
|
|
|
|
find_reducer(k, cc) {
|
|
|
|
let c;
|
2011-12-22 19:53:53 -06:00
|
|
|
// log(error, "finding reducer for " + k);
|
2011-08-25 17:36:55 -05:00
|
|
|
alt treemap::find(reducers, k) {
|
|
|
|
some(_c) {
|
2011-12-22 19:53:53 -06:00
|
|
|
// log(error,
|
2011-12-22 16:42:52 -06:00
|
|
|
// "reusing existing reducer for " + k);
|
2011-08-25 17:36:55 -05:00
|
|
|
c = _c;
|
|
|
|
}
|
2012-01-19 00:37:22 -06:00
|
|
|
none {
|
2011-12-22 19:53:53 -06:00
|
|
|
// log(error, "creating new reducer for " + k);
|
2011-08-25 17:36:55 -05:00
|
|
|
let p = port();
|
2012-01-06 22:55:56 -06:00
|
|
|
let ch = chan(p);
|
2011-08-25 17:36:55 -05:00
|
|
|
let r = reduce, kk = k;
|
2012-01-06 22:55:56 -06:00
|
|
|
tasks += [
|
|
|
|
task::spawn_joinable {|| reduce_task(r, kk, ch) }
|
|
|
|
];
|
2011-08-25 17:36:55 -05:00
|
|
|
c = recv(p);
|
|
|
|
treemap::insert(reducers, k, c);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
send(cc, c);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2012-01-05 08:35:37 -06:00
|
|
|
fn finish<K: send, V: send>(_k: K, v: chan<reduce_proto<V>>) {
|
2011-10-25 08:56:55 -05:00
|
|
|
send(v, done);
|
|
|
|
}
|
2011-08-25 17:36:55 -05:00
|
|
|
treemap::traverse(reducers, finish);
|
|
|
|
|
2012-04-06 13:01:43 -05:00
|
|
|
for tasks.each {|t| task::join(t); }
|
2011-08-25 17:36:55 -05:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-09-02 17:34:58 -05:00
|
|
|
fn main(argv: [str]) {
|
2011-08-25 17:36:55 -05:00
|
|
|
if vec::len(argv) < 2u {
|
|
|
|
let out = io::stdout();
|
|
|
|
|
2011-09-02 17:34:58 -05:00
|
|
|
out.write_line(#fmt["Usage: %s <filename> ...", argv[0]]);
|
2011-08-25 17:36:55 -05:00
|
|
|
|
|
|
|
// TODO: run something just to make sure the code hasn't
|
|
|
|
// broken yet. This is the unit test mode of this program.
|
|
|
|
|
|
|
|
ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
let iargs = [];
|
2012-04-06 13:01:43 -05:00
|
|
|
vec::iter_between(argv, 1u, vec::len(argv)) {|a|
|
2011-09-01 19:27:58 -05:00
|
|
|
iargs += [str::bytes(a)];
|
2011-08-25 17:36:55 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
let start = time::precise_time_ns();
|
|
|
|
|
|
|
|
map_reduce::map_reduce(map, reduce, iargs);
|
|
|
|
let stop = time::precise_time_ns();
|
|
|
|
|
|
|
|
let elapsed = stop - start;
|
|
|
|
elapsed /= 1000000u64;
|
|
|
|
|
2011-12-22 19:53:53 -06:00
|
|
|
log(error, "MapReduce completed in "
|
2011-12-22 16:42:52 -06:00
|
|
|
+ u64::str(elapsed) + "ms");
|
2011-08-25 17:36:55 -05:00
|
|
|
}
|
|
|
|
|
2011-09-02 17:34:58 -05:00
|
|
|
fn read_word(r: io::reader) -> option<str> {
|
|
|
|
let w = "";
|
2011-08-25 17:36:55 -05:00
|
|
|
|
|
|
|
while !r.eof() {
|
|
|
|
let c = r.read_char();
|
|
|
|
|
|
|
|
|
|
|
|
if is_word_char(c) {
|
2011-09-01 19:27:58 -05:00
|
|
|
w += str::from_char(c);
|
2011-09-02 17:34:58 -05:00
|
|
|
} else { if w != "" { ret some(w); } }
|
2011-08-25 17:36:55 -05:00
|
|
|
}
|
|
|
|
ret none;
|
|
|
|
}
|
|
|
|
|
|
|
|
// True exactly for the ASCII decimal digits '0' through '9'.
fn is_digit(c: char) -> bool {
    // '0'..'9' are contiguous code points, so a range check is equivalent
    // to listing each digit.
    c >= '0' && c <= '9'
}
|
|
|
|
|
|
|
|
// True exactly for the ASCII lowercase letters 'a' through 'z'.
fn is_alpha_lower(c: char) -> bool {
    // 'a'..'z' are contiguous code points, so a range check is equivalent
    // to listing each letter.
    c >= 'a' && c <= 'z'
}
|
|
|
|
|
|
|
|
// True exactly for the ASCII uppercase letters 'A' through 'Z'.
fn is_alpha_upper(c: char) -> bool {
    // 'A'..'Z' are contiguous code points, so a range check is equivalent
    // to listing each letter.
    c >= 'A' && c <= 'Z'
}
|
|
|
|
|
|
|
|
// True when `c` is accepted by either `is_alpha_upper` or `is_alpha_lower`.
fn is_alpha(c: char) -> bool {
    if is_alpha_upper(c) { true } else { is_alpha_lower(c) }
}
|
|
|
|
|
|
|
|
fn is_word_char(c: char) -> bool { is_alpha(c) || is_digit(c) || c == '_' }
|