Merge pull request #128 from miguelraz/dotprodexample
add dot_product example
This commit is contained in:
commit
582239ac3b
13
crates/core_simd/examples/README.md
Normal file
13
crates/core_simd/examples/README.md
Normal file
@ -0,0 +1,13 @@
|
||||
### `stdsimd` examples
|
||||
|
||||
This crate is a port of example uses of `stdsimd`, mostly taken from the `packed_simd` crate.
|
||||
|
||||
The examples contain, as in the case of `dot_product.rs`, multiple ways of solving the problem, in order to show idiomatic uses of SIMD and iteration of performance designs.
|
||||
|
||||
Run the tests with the command
|
||||
|
||||
```
|
||||
cargo run --example dot_product
|
||||
```
|
||||
|
||||
and verify the code for `dot_product.rs` on your machine.
|
169
crates/core_simd/examples/dot_product.rs
Normal file
169
crates/core_simd/examples/dot_product.rs
Normal file
@ -0,0 +1,169 @@
|
||||
// Code taken from the `packed_simd` crate
|
||||
// Run this code with `cargo test --example dot_product`
|
||||
//use std::iter::zip;
|
||||
|
||||
#![feature(array_chunks)]
|
||||
#![feature(slice_as_chunks)]
|
||||
// Add these imports to use the stdsimd library
|
||||
#![feature(portable_simd)]
|
||||
use core_simd::simd::*;
|
||||
|
||||
// This is your barebones dot product implementation:
|
||||
// Take 2 vectors, multiply them element wise and *then*
|
||||
// go along the resulting array and add up the result.
|
||||
// In the next example we will see if there
|
||||
// is any difference to adding and multiplying in tandem.
|
||||
pub fn dot_prod_scalar_0(a: &[f32], b: &[f32]) -> f32 {
|
||||
assert_eq!(a.len(), b.len());
|
||||
|
||||
a.iter().zip(b.iter()).map(|(a, b)| a * b).sum()
|
||||
}
|
||||
|
||||
// When dealing with SIMD, it is very important to think about the amount
|
||||
// of data movement and when it happens. We're going over simple computation examples here, and yet
|
||||
// it is not trivial to understand what may or may not contribute to performance
|
||||
// changes. Eventually, you will need tools to inspect the generated assembly and confirm your
|
||||
// hypothesis and benchmarks - we will mention them later on.
|
||||
// With the use of `fold`, we're doing a multiplication,
|
||||
// and then adding it to the sum, one element from both vectors at a time.
|
||||
pub fn dot_prod_scalar_1(a: &[f32], b: &[f32]) -> f32 {
|
||||
assert_eq!(a.len(), b.len());
|
||||
a.iter()
|
||||
.zip(b.iter())
|
||||
.fold(0.0, |a, zipped| a + zipped.0 * zipped.1)
|
||||
}
|
||||
|
||||
// We now move on to the SIMD implementations: notice the following constructs:
|
||||
// `array_chunks::<4>`: mapping this over the vector will let use construct SIMD vectors
|
||||
// `f32x4::from_array`: construct the SIMD vector from a slice
|
||||
// `(a * b).reduce_sum()`: Multiply both f32x4 vectors together, and then reduce them.
|
||||
// This approach essentially uses SIMD to produce a vector of length N/4 of all the products,
|
||||
// and then add those with `sum()`. This is suboptimal.
|
||||
// TODO: ASCII diagrams
|
||||
pub fn dot_prod_simd_0(a: &[f32], b: &[f32]) -> f32 {
|
||||
assert_eq!(a.len(), b.len());
|
||||
// TODO handle remainder when a.len() % 4 != 0
|
||||
a.array_chunks::<4>()
|
||||
.map(|&a| f32x4::from_array(a))
|
||||
.zip(b.array_chunks::<4>().map(|&b| f32x4::from_array(b)))
|
||||
.map(|(a, b)| (a * b).reduce_sum())
|
||||
.sum()
|
||||
}
|
||||
|
||||
// There's some simple ways to improve the previous code:
|
||||
// 1. Make a `zero` `f32x4` SIMD vector that we will be accumulating into
|
||||
// So that there is only one `sum()` reduction when the last `f32x4` has been processed
|
||||
// 2. Exploit Fused Multiply Add so that the multiplication, addition and sinking into the reduciton
|
||||
// happen in the same step.
|
||||
// If the arrays are large, minimizing the data shuffling will lead to great perf.
|
||||
// If the arrays are small, handling the remainder elements when the length isn't a multiple of 4
|
||||
// Can become a problem.
|
||||
pub fn dot_prod_simd_1(a: &[f32], b: &[f32]) -> f32 {
|
||||
assert_eq!(a.len(), b.len());
|
||||
// TODO handle remainder when a.len() % 4 != 0
|
||||
a.array_chunks::<4>()
|
||||
.map(|&a| f32x4::from_array(a))
|
||||
.zip(b.array_chunks::<4>().map(|&b| f32x4::from_array(b)))
|
||||
.fold(f32x4::splat(0.0), |acc, zipped| acc + zipped.0 * zipped.1)
|
||||
.reduce_sum()
|
||||
}
|
||||
|
||||
// A lot of knowledgeable use of SIMD comes from knowing specific instructions that are
|
||||
// available - let's try to use the `mul_add` instruction, which is the fused-multiply-add we were looking for.
|
||||
use std_float::StdFloat;
|
||||
pub fn dot_prod_simd_2(a: &[f32], b: &[f32]) -> f32 {
|
||||
assert_eq!(a.len(), b.len());
|
||||
// TODO handle remainder when a.len() % 4 != 0
|
||||
let mut res = f32x4::splat(0.0);
|
||||
a.array_chunks::<4>()
|
||||
.map(|&a| f32x4::from_array(a))
|
||||
.zip(b.array_chunks::<4>().map(|&b| f32x4::from_array(b)))
|
||||
.for_each(|(a, b)| {
|
||||
res = a.mul_add(b, res);
|
||||
});
|
||||
res.reduce_sum()
|
||||
}
|
||||
|
||||
// Finally, we will write the same operation but handling the loop remainder.
|
||||
const LANES: usize = 4;
|
||||
pub fn dot_prod_simd_3(a: &[f32], b: &[f32]) -> f32 {
|
||||
assert_eq!(a.len(), b.len());
|
||||
|
||||
let (a_extra, a_chunks) = a.as_rchunks();
|
||||
let (b_extra, b_chunks) = b.as_rchunks();
|
||||
|
||||
// These are always true, but for emphasis:
|
||||
assert_eq!(a_chunks.len(), b_chunks.len());
|
||||
assert_eq!(a_extra.len(), b_extra.len());
|
||||
|
||||
let mut sums = [0.0; LANES];
|
||||
for ((x, y), d) in std::iter::zip(a_extra, b_extra).zip(&mut sums) {
|
||||
*d = x * y;
|
||||
}
|
||||
|
||||
let mut sums = f32x4::from_array(sums);
|
||||
std::iter::zip(a_chunks, b_chunks).for_each(|(x, y)| {
|
||||
sums += f32x4::from_array(*x) * f32x4::from_array(*y);
|
||||
});
|
||||
|
||||
sums.reduce_sum()
|
||||
}
|
||||
|
||||
// Finally, we present an iterator version for handling remainders in a scalar fashion at the end of the loop.
|
||||
// Unfortunately, this is allocating 1 `XMM` register on the order of `~len(a)` - we'll see how we can get around it in the
|
||||
// next example.
|
||||
pub fn dot_prod_simd_4(a: &[f32], b: &[f32]) -> f32 {
|
||||
let mut sum = a
|
||||
.array_chunks::<4>()
|
||||
.map(|&a| f32x4::from_array(a))
|
||||
.zip(b.array_chunks::<4>().map(|&b| f32x4::from_array(b)))
|
||||
.map(|(a, b)| a * b)
|
||||
.fold(f32x4::splat(0.0), std::ops::Add::add)
|
||||
.reduce_sum();
|
||||
let remain = a.len() - (a.len() % 4);
|
||||
sum += a[remain..]
|
||||
.iter()
|
||||
.zip(&b[remain..])
|
||||
.map(|(a, b)| a * b)
|
||||
.sum::<f32>();
|
||||
sum
|
||||
}
|
||||
|
||||
// This version allocates a single `XMM` register for accumulation, and the folds don't allocate on top of that.
|
||||
// Notice the the use of `mul_add`, which can do a multiply and an add operation ber iteration.
|
||||
pub fn dot_prod_simd_5(a: &[f32], b: &[f32]) -> f32 {
|
||||
a.array_chunks::<4>()
|
||||
.map(|&a| f32x4::from_array(a))
|
||||
.zip(b.array_chunks::<4>().map(|&b| f32x4::from_array(b)))
|
||||
.fold(f32x4::splat(0.), |acc, (a, b)| a.mul_add(b, acc))
|
||||
.reduce_sum()
|
||||
}
|
||||
|
||||
fn main() {
|
||||
// Empty main to make cargo happy
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
#[test]
|
||||
fn smoke_test() {
|
||||
use super::*;
|
||||
let a: Vec<f32> = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
|
||||
let b: Vec<f32> = vec![-8.0, -7.0, -6.0, -5.0, 4.0, 3.0, 2.0, 1.0];
|
||||
let x: Vec<f32> = [0.5; 1003].to_vec();
|
||||
let y: Vec<f32> = [2.0; 1003].to_vec();
|
||||
|
||||
// Basic check
|
||||
assert_eq!(0.0, dot_prod_scalar_0(&a, &b));
|
||||
assert_eq!(0.0, dot_prod_scalar_1(&a, &b));
|
||||
assert_eq!(0.0, dot_prod_simd_0(&a, &b));
|
||||
assert_eq!(0.0, dot_prod_simd_1(&a, &b));
|
||||
assert_eq!(0.0, dot_prod_simd_2(&a, &b));
|
||||
assert_eq!(0.0, dot_prod_simd_3(&a, &b));
|
||||
assert_eq!(0.0, dot_prod_simd_4(&a, &b));
|
||||
assert_eq!(0.0, dot_prod_simd_5(&a, &b));
|
||||
|
||||
// We can handle vectors that are non-multiples of 4
|
||||
assert_eq!(1003.0, dot_prod_simd_3(&x, &y));
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user