From 8872163b32011dd546e69d349d9c5de22cc218b8 Mon Sep 17 00:00:00 2001
From: Ruud van Asseldonk <ruuda@google.com>
Date: Sat, 5 Mar 2016 15:52:08 +0100
Subject: [PATCH 1/3] Define x86 fused multiply-add intrinsics

This defines the following intrinsics for 128-bit and 256-bit vectors of
f32 and f64:

 * `fmadd`
 * `fmaddsub`
 * `fmsub`
 * `fmsubadd`
 * `fnmadd`
 * `fnmsub`

The `_sd` and `_ss` variants are not included yet.

Intel intrinsic reference: https://software.intel.com/en-us/node/523929

The intrinsics there are listed under AVX2, but in the Intel Intrinsics
Guide they are part of the "FMA" technology, and LLVM puts them under
FMA, not AVX2.
---
 src/etc/platform-intrinsics/x86/fma.json | 47 ++++++++++++++++++++++++
 1 file changed, 47 insertions(+)
 create mode 100644 src/etc/platform-intrinsics/x86/fma.json

diff --git a/src/etc/platform-intrinsics/x86/fma.json b/src/etc/platform-intrinsics/x86/fma.json
new file mode 100644
index 00000000000..c922d166c8f
--- /dev/null
+++ b/src/etc/platform-intrinsics/x86/fma.json
@@ -0,0 +1,47 @@
+{
+    "llvm_prefix": "llvm.x86.fma.",
+    "intrinsics": [
+        {
+            "intrinsic": "{0.width_mm}_fmadd_{0.data_type}",
+            "width": [128, 256],
+            "llvm": "vfmadd.{0.data_type_short}{0.width_suffix}",
+            "ret": "f(32-64)",
+            "args": ["0", "0", "0"]
+        },
+        {
+            "intrinsic": "{0.width_mm}_fmaddsub_{0.data_type}",
+            "width": [128, 256],
+            "llvm": "vfmaddsub.{0.data_type_short}{0.width_suffix}",
+            "ret": "f(32-64)",
+            "args": ["0", "0", "0"]
+        },
+        {
+            "intrinsic": "{0.width_mm}_fmsub_{0.data_type}",
+            "width": [128, 256],
+            "llvm": "vfmsub.{0.data_type_short}{0.width_suffix}",
+            "ret": "f(32-64)",
+            "args": ["0", "0", "0"]
+        },
+        {
+            "intrinsic": "{0.width_mm}_fmsubadd_{0.data_type}",
+            "width": [128, 256],
+            "llvm": "vfmsubadd.{0.data_type_short}{0.width_suffix}",
+            "ret": "f(32-64)",
+            "args": ["0", "0", "0"]
+        },
+        {
+            "intrinsic": "{0.width_mm}_fnmadd_{0.data_type}",
+            "width": [128, 256],
+            "llvm": "vfnmadd.{0.data_type_short}{0.width_suffix}",
+            "ret": "f(32-64)",
+            "args": ["0", "0", "0"]
+        },
+        {
+            "intrinsic": "{0.width_mm}_fnmsub_{0.data_type}",
+            "width": [128, 256],
+            "llvm": "vfnmsub.{0.data_type_short}{0.width_suffix}",
+            "ret": "f(32-64)",
+            "args": ["0", "0", "0"]
+        }
+    ]
+}

From 0ce0cf1c87012dca1f21513566eab8b3210b029b Mon Sep 17 00:00:00 2001
From: Ruud van Asseldonk <ruuda@google.com>
Date: Sat, 5 Mar 2016 16:17:55 +0100
Subject: [PATCH 2/3] Update platform intrinsic generator script

The file it generates had been edited by hand; the generator should
have been modified instead, and the file regenerated. This merges those
modifications into the template in the generator.
---
 src/etc/platform-intrinsics/generator.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/etc/platform-intrinsics/generator.py b/src/etc/platform-intrinsics/generator.py
index e3aa4e688d3..0e0d4841063 100644
--- a/src/etc/platform-intrinsics/generator.py
+++ b/src/etc/platform-intrinsics/generator.py
@@ -691,7 +691,7 @@ def parse_args():
     parser.add_argument('-o', '--out', type=argparse.FileType('w'), default=sys.stdout,
                         help = 'File to output to (default stdout).')
     parser.add_argument('-i', '--info', type=argparse.FileType('r'),
-                        help = 'File containing platform specific information to merge into'
+                        help = 'File containing platform specific information to merge into '
                                 'the input files\' header.')
     parser.add_argument('in_', metavar="FILE", type=argparse.FileType('r'), nargs='+',
                         help = 'JSON files to load')
@@ -735,12 +735,12 @@ class CompilerDefs(object):
 
 use {{Intrinsic, i, i_, u, u_, f, v, v_, agg, p, void}};
 use IntrinsicDef::Named;
-use rustc::middle::ty;
+use rustc::middle::ty::TyCtxt;
 
 // The default inlining settings trigger a pathological behaviour in
 // LLVM, which causes makes compilation very slow. See #28273.
 #[inline(never)]
-pub fn find<'tcx>(_tcx: &ty::ctxt<'tcx>, name: &str) -> Option<Intrinsic> {{
+pub fn find<'tcx>(_tcx: &TyCtxt<'tcx>, name: &str) -> Option<Intrinsic> {{
     if !name.starts_with("{0}") {{ return None }}
     Some(match &name["{0}".len()..] {{'''.format(platform.intrinsic_prefix())
 

From a409076df4ac1e80d0e8b4ed55608cbd354129ef Mon Sep 17 00:00:00 2001
From: Ruud van Asseldonk <ruuda@google.com>
Date: Sat, 5 Mar 2016 16:25:58 +0100
Subject: [PATCH 3/3] Regenerate x86 platform intrinsics

The exact command used was:

    $ cd src/etc/platform-intrinsics/x86
    $ python2 ../generator.py --format compiler-defs -i info.json   \
      sse.json sse2.json sse3.json ssse3.json sse41.json sse42.json \
      avx.json avx2.json fma.json                                   \
      > ../../../librustc_platform_intrinsics/x86.rs
---
 src/librustc_platform_intrinsics/x86.rs | 120 ++++++++++++++++++++++++
 1 file changed, 120 insertions(+)

diff --git a/src/librustc_platform_intrinsics/x86.rs b/src/librustc_platform_intrinsics/x86.rs
index 4a9b9970caf..168ae79ab74 100644
--- a/src/librustc_platform_intrinsics/x86.rs
+++ b/src/librustc_platform_intrinsics/x86.rs
@@ -1108,6 +1108,126 @@ pub fn find<'tcx>(_tcx: &TyCtxt<'tcx>, name: &str) -> Option<Intrinsic> {
             output: v(u(16), 16),
             definition: Named("llvm.x86.avx2.psubus.w")
         },
+        "_fmadd_ps" => Intrinsic {
+            inputs: vec![v(f(32), 4), v(f(32), 4), v(f(32), 4)],
+            output: v(f(32), 4),
+            definition: Named("llvm.x86.fma.vfmadd.ps")
+        },
+        "_fmadd_pd" => Intrinsic {
+            inputs: vec![v(f(64), 2), v(f(64), 2), v(f(64), 2)],
+            output: v(f(64), 2),
+            definition: Named("llvm.x86.fma.vfmadd.pd")
+        },
+        "256_fmadd_ps" => Intrinsic {
+            inputs: vec![v(f(32), 8), v(f(32), 8), v(f(32), 8)],
+            output: v(f(32), 8),
+            definition: Named("llvm.x86.fma.vfmadd.ps.256")
+        },
+        "256_fmadd_pd" => Intrinsic {
+            inputs: vec![v(f(64), 4), v(f(64), 4), v(f(64), 4)],
+            output: v(f(64), 4),
+            definition: Named("llvm.x86.fma.vfmadd.pd.256")
+        },
+        "_fmaddsub_ps" => Intrinsic {
+            inputs: vec![v(f(32), 4), v(f(32), 4), v(f(32), 4)],
+            output: v(f(32), 4),
+            definition: Named("llvm.x86.fma.vfmaddsub.ps")
+        },
+        "_fmaddsub_pd" => Intrinsic {
+            inputs: vec![v(f(64), 2), v(f(64), 2), v(f(64), 2)],
+            output: v(f(64), 2),
+            definition: Named("llvm.x86.fma.vfmaddsub.pd")
+        },
+        "256_fmaddsub_ps" => Intrinsic {
+            inputs: vec![v(f(32), 8), v(f(32), 8), v(f(32), 8)],
+            output: v(f(32), 8),
+            definition: Named("llvm.x86.fma.vfmaddsub.ps.256")
+        },
+        "256_fmaddsub_pd" => Intrinsic {
+            inputs: vec![v(f(64), 4), v(f(64), 4), v(f(64), 4)],
+            output: v(f(64), 4),
+            definition: Named("llvm.x86.fma.vfmaddsub.pd.256")
+        },
+        "_fmsub_ps" => Intrinsic {
+            inputs: vec![v(f(32), 4), v(f(32), 4), v(f(32), 4)],
+            output: v(f(32), 4),
+            definition: Named("llvm.x86.fma.vfmsub.ps")
+        },
+        "_fmsub_pd" => Intrinsic {
+            inputs: vec![v(f(64), 2), v(f(64), 2), v(f(64), 2)],
+            output: v(f(64), 2),
+            definition: Named("llvm.x86.fma.vfmsub.pd")
+        },
+        "256_fmsub_ps" => Intrinsic {
+            inputs: vec![v(f(32), 8), v(f(32), 8), v(f(32), 8)],
+            output: v(f(32), 8),
+            definition: Named("llvm.x86.fma.vfmsub.ps.256")
+        },
+        "256_fmsub_pd" => Intrinsic {
+            inputs: vec![v(f(64), 4), v(f(64), 4), v(f(64), 4)],
+            output: v(f(64), 4),
+            definition: Named("llvm.x86.fma.vfmsub.pd.256")
+        },
+        "_fmsubadd_ps" => Intrinsic {
+            inputs: vec![v(f(32), 4), v(f(32), 4), v(f(32), 4)],
+            output: v(f(32), 4),
+            definition: Named("llvm.x86.fma.vfmsubadd.ps")
+        },
+        "_fmsubadd_pd" => Intrinsic {
+            inputs: vec![v(f(64), 2), v(f(64), 2), v(f(64), 2)],
+            output: v(f(64), 2),
+            definition: Named("llvm.x86.fma.vfmsubadd.pd")
+        },
+        "256_fmsubadd_ps" => Intrinsic {
+            inputs: vec![v(f(32), 8), v(f(32), 8), v(f(32), 8)],
+            output: v(f(32), 8),
+            definition: Named("llvm.x86.fma.vfmsubadd.ps.256")
+        },
+        "256_fmsubadd_pd" => Intrinsic {
+            inputs: vec![v(f(64), 4), v(f(64), 4), v(f(64), 4)],
+            output: v(f(64), 4),
+            definition: Named("llvm.x86.fma.vfmsubadd.pd.256")
+        },
+        "_fnmadd_ps" => Intrinsic {
+            inputs: vec![v(f(32), 4), v(f(32), 4), v(f(32), 4)],
+            output: v(f(32), 4),
+            definition: Named("llvm.x86.fma.vfnmadd.ps")
+        },
+        "_fnmadd_pd" => Intrinsic {
+            inputs: vec![v(f(64), 2), v(f(64), 2), v(f(64), 2)],
+            output: v(f(64), 2),
+            definition: Named("llvm.x86.fma.vfnmadd.pd")
+        },
+        "256_fnmadd_ps" => Intrinsic {
+            inputs: vec![v(f(32), 8), v(f(32), 8), v(f(32), 8)],
+            output: v(f(32), 8),
+            definition: Named("llvm.x86.fma.vfnmadd.ps.256")
+        },
+        "256_fnmadd_pd" => Intrinsic {
+            inputs: vec![v(f(64), 4), v(f(64), 4), v(f(64), 4)],
+            output: v(f(64), 4),
+            definition: Named("llvm.x86.fma.vfnmadd.pd.256")
+        },
+        "_fnmsub_ps" => Intrinsic {
+            inputs: vec![v(f(32), 4), v(f(32), 4), v(f(32), 4)],
+            output: v(f(32), 4),
+            definition: Named("llvm.x86.fma.vfnmsub.ps")
+        },
+        "_fnmsub_pd" => Intrinsic {
+            inputs: vec![v(f(64), 2), v(f(64), 2), v(f(64), 2)],
+            output: v(f(64), 2),
+            definition: Named("llvm.x86.fma.vfnmsub.pd")
+        },
+        "256_fnmsub_ps" => Intrinsic {
+            inputs: vec![v(f(32), 8), v(f(32), 8), v(f(32), 8)],
+            output: v(f(32), 8),
+            definition: Named("llvm.x86.fma.vfnmsub.ps.256")
+        },
+        "256_fnmsub_pd" => Intrinsic {
+            inputs: vec![v(f(64), 4), v(f(64), 4), v(f(64), 4)],
+            output: v(f(64), 4),
+            definition: Named("llvm.x86.fma.vfnmsub.pd.256")
+        },
         _ => return None,
     })
 }