Auto merge of #28221 - huonw:simd, r=alexcrichton

The ARM equivalents of the AArch64 intrinsics are annoyingly more complicated (and some of the AArch64 ones are too). I think I've now exposed all the x86 intrinsics from SSE to AVX2 (at least, the ones that LLVM implements as callable intrinsics).
2015-09-05 02:15:41 +00:00 · 2015-09-05 02:15:41 +00:00 · 7ee876cb8e
parent 779b2a9847 67aa4c775a
commit 7ee876cb8e
13 changed files with 1418 additions and 112 deletions
--- a/src/etc/platform-intrinsics/aarch64.json
+++ b/src/etc/platform-intrinsics/aarch64.json
@ -336,6 +336,48 @@
            "ret": "i8",
            "args": ["0"]
        },
+        {
+            "intrinsic": "ld2{0[0].width}_{0[0].data_type}",
+            "width": [64, 128],
+            "llvm": "ld2.{0[0].llvm_name}.{1.llvm_name}",
+            "ret": ["[i(8-64);2]","[f(32-64);2]"],
+            "args": ["0.0SPc/0.0"]
+        },
+        {
+            "intrinsic": "ld3{0[0].width}_{0[0].data_type}",
+            "width": [64, 128],
+            "llvm": "ld3.{0[0].llvm_name}.{1.llvm_name}",
+            "ret": ["[i(8-64);3]","[f(32-64);3]"],
+            "args": ["0.0SPc/0.0"]
+        },
+        {
+            "intrinsic": "ld4{0[0].width}_{0[0].data_type}",
+            "width": [64, 128],
+            "llvm": "ld4.{0[0].llvm_name}.{1.llvm_name}",
+            "ret": ["[i(8-64);4]","[f(32-64);4]"],
+            "args": ["0.0SPc/0.0"]
+        },
+        {
+            "intrinsic": "ld2{0[0].width}_dup_{0[0].data_type}",
+            "width": [64, 128],
+            "llvm": "ld2.{0[0].llvm_name}.{1.llvm_name}",
+            "ret": ["[i(8-64);2]","[f(32-64);2]"],
+            "args": ["0.0SPc"]
+        },
+        {
+            "intrinsic": "ld3{0[0].width}_dup_{0[0].data_type}",
+            "width": [64, 128],
+            "llvm": "ld3.{0[0].llvm_name}.{1.llvm_name}",
+            "ret": ["[i(8-64);3]","[f(32-64);3]"],
+            "args": ["0.0SPc"]
+        },
+        {
+            "intrinsic": "ld4{0[0].width}_dup_{0[0].data_type}",
+            "width": [64, 128],
+            "llvm": "ld4.{0[0].llvm_name}.{1.llvm_name}",
+            "ret": ["[i(8-64);4]","[f(32-64);4]"],
+            "args": ["0.0SPc"]
+        },
        {
            "intrinsic": "padd{0.width}_{0.data_type}",
            "width": [64, 128],
--- a/src/etc/platform-intrinsics/generator.py
+++ b/src/etc/platform-intrinsics/generator.py
@ -14,11 +14,13 @@ import argparse
 import sys
 import re
 import textwrap
+import itertools

 SPEC = re.compile(
-    r'^(?:(?P<id>[iusfIUSF])(?:\((?P<start>\d+)-(?P<end>\d+)\)|'
+    r'^(?:(?P<void>V)|(?P<id>[iusfIUSF])(?:\((?P<start>\d+)-(?P<end>\d+)\)|'
    r'(?P<width>\d+)(:?/(?P<llvm_width>\d+))?)'
-    r'|(?P<reference>\d+)(?P<modifiers>[vShdnwus]*)(?P<force_width>x\d+)?)$'
+    r'|(?P<reference>\d+))(?P<index>\.\d+)?(?P<modifiers>[vShdnwusfDMC]*)(?P<force_width>x\d+)?'
+    r'(?:(?P<pointer>Pm|Pc)(?P<llvm_pointer>/.*)?|(?P<bitcast>->.*))?$'
 )

 class PlatformInfo(object):
@ -68,18 +70,35 @@ class IntrinsicSet(object):
                                {k: lookup(v) for k, v in data.items()})

 class PlatformTypeInfo(object):
-    def __init__(self, llvm_name, properties):
-        self.properties = properties
-        self.llvm_name = llvm_name
+    def __init__(self, llvm_name, properties, elems = None):
+        if elems is None:
+            self.properties = properties
+            self.llvm_name = llvm_name
+        else:
+            assert properties is None and llvm_name is None
+            self.properties = {}
+            self.elems = elems
+
+    def __repr__(self):
+        return '<PlatformTypeInfo {}, {}>'.format(self.llvm_name, self.properties)

    def __getattr__(self, name):
        return self.properties[name]

+    def __getitem__(self, idx):
+        return self.elems[idx]
+
    def vectorize(self, length, width_info):
        props = self.properties.copy()
        props.update(width_info)
        return PlatformTypeInfo('v{}{}'.format(length, self.llvm_name), props)

+    def pointer(self, llvm_elem):
+        name = self.llvm_name if llvm_elem is None else llvm_elem.llvm_name
+        return PlatformTypeInfo('p0{}'.format(name), self.properties)
+
+BITWIDTH_POINTER = '<pointer>'
+
 class Type(object):
    def __init__(self, bitwidth):
        self._bitwidth = bitwidth
@ -87,18 +106,39 @@ class Type(object):
    def bitwidth(self):
        return self._bitwidth

-    def modify(self, spec, width):
+    def modify(self, spec, width, previous):
        raise NotImplementedError()

+    def __ne__(self, other):
+        return not (self == other)
+
+class Void(Type):
+    def __init__(self):
+        Type.__init__(self, 0)
+
+    def compiler_ctor(self):
+        return 'void()'
+
+    def rust_name(self):
+        return '()'
+
+    def type_info(self, platform_info):
+        return None
+
+    def __eq__(self, other):
+        return isinstance(other, Void)
+
 class Number(Type):
    def __init__(self, bitwidth):
        Type.__init__(self, bitwidth)

-    def modify(self, spec, width):
+    def modify(self, spec, width, previous):
        if spec == 'u':
            return Unsigned(self.bitwidth())
        elif spec == 's':
            return Signed(self.bitwidth())
+        elif spec == 'f':
+            return Float(self.bitwidth())
        elif spec == 'w':
            return self.__class__(self.bitwidth() * 2)
        elif spec == 'n':
@ -111,11 +151,16 @@ class Number(Type):
    def type_info(self, platform_info):
        return platform_info.number_type_info(self)

+    def __eq__(self, other):
+        # print(self, other)
+        return self.__class__ == other.__class__ and self.bitwidth() == other.bitwidth()
+
 class Signed(Number):
    def __init__(self, bitwidth, llvm_bitwidth = None):
        Number.__init__(self, bitwidth)
        self._llvm_bitwidth = llvm_bitwidth

+
    def compiler_ctor(self):
        if self._llvm_bitwidth is None:
            return 'i({})'.format(self.bitwidth())
@ -164,26 +209,47 @@ class Float(Number):
        return 'f{}'.format(self.bitwidth())

 class Vector(Type):
-    def __init__(self, elem, length):
+    def __init__(self, elem, length, bitcast = None):
        assert isinstance(elem, Type) and not isinstance(elem, Vector)
        Type.__init__(self,
                      elem.bitwidth() * length)
        self._length = length
        self._elem = elem
+        assert bitcast is None or (isinstance(bitcast, Vector) and
+                                   bitcast._bitcast is None and
+                                   bitcast._elem.bitwidth() == elem.bitwidth())
+        if bitcast is not None and bitcast._elem != elem:
+            self._bitcast = bitcast._elem
+        else:
+            self._bitcast = None

-    def modify(self, spec, width):
-        if spec == 'h':
+    def modify(self, spec, width, previous):
+        if spec == 'S':
+            return self._elem
+        elif spec == 'h':
            return Vector(self._elem, self._length // 2)
        elif spec == 'd':
            return Vector(self._elem, self._length * 2)
        elif spec.startswith('x'):
            new_bitwidth = int(spec[1:])
            return Vector(self._elem, new_bitwidth // self._elem.bitwidth())
+        elif spec.startswith('->'):
+            bitcast_to = TypeSpec(spec[2:])
+            choices = list(bitcast_to.enumerate(width, previous))
+            assert len(choices) == 1
+            bitcast_to = choices[0]
+            return Vector(self._elem, self._length, bitcast_to)
        else:
-            return Vector(self._elem.modify(spec, width), self._length)
+            return Vector(self._elem.modify(spec, width, previous), self._length)

    def compiler_ctor(self):
-        return 'v({}, {})'.format(self._elem.compiler_ctor(), self._length)
+        if self._bitcast is None:
+            return 'v({}, {})'.format(self._elem.compiler_ctor(),
+                                      self._length)
+        else:
+            return 'v_({}, {}, {})'.format(self._elem.compiler_ctor(),
+                                           self._bitcast.compiler_ctor(),
+                                           self._length)

    def rust_name(self):
        return '{}x{}'.format(self._elem.rust_name(), self._length)
@ -193,6 +259,51 @@ class Vector(Type):
        return elem_info.vectorize(self._length,
                                   platform_info.width_info(self.bitwidth()))

+    def __eq__(self, other):
+        return isinstance(other, Vector) and self._length == other._length and \
+            self._elem == other._elem and self._bitcast == other._bitcast
+
+class Pointer(Type):
+    def __init__(self, elem, llvm_elem, const):
+        self._elem = elem;
+        self._llvm_elem = llvm_elem
+        self._const = const
+        Type.__init__(self, BITWIDTH_POINTER)
+
+    def modify(self, spec, width, previous):
+        if spec == 'D':
+            return self._elem
+        elif spec == 'M':
+            return Pointer(self._elem, self._llvm_elem, False)
+        elif spec == 'C':
+            return Pointer(self._elem, self._llvm_elem, True)
+        else:
+            return Pointer(self._elem.modify(spec, width, previous), self._llvm_elem, self._const)
+
+    def compiler_ctor(self):
+        if self._llvm_elem is None:
+            llvm_elem = 'None'
+        else:
+            llvm_elem = 'Some({})'.format(self._llvm_elem.compiler_ctor())
+        return 'p({}, {}, {})'.format('true' if self._const else 'false',
+                                      self._elem.compiler_ctor(),
+                                      llvm_elem)
+
+    def rust_name(self):
+        return '*{} {}'.format('const' if self._const else 'mut',
+                               self._elem.rust_name())
+
+    def type_info(self, platform_info):
+        if self._llvm_elem is None:
+            llvm_elem = None
+        else:
+            llvm_elem = self._llvm_elem.type_info(platform_info)
+        return self._elem.type_info(platform_info).pointer(llvm_elem)
+
+    def __eq__(self, other):
+        return isinstance(other, Pointer) and self._const == other._const \
+            and self._elem == other._elem and self._llvm_elem == other._llvm_elem
+
 class Aggregate(Type):
    def __init__(self, flatten, elems):
        self._flatten = flatten
@ -202,6 +313,14 @@ class Aggregate(Type):
    def __repr__(self):
        return '<Aggregate {}>'.format(self._elems)

+    def modify(self, spec, width, previous):
+        if spec.startswith('.'):
+            num = int(spec[1:])
+            return self._elems[num]
+        else:
+            print(spec)
+            raise NotImplementedError()
+
    def compiler_ctor(self):
        return 'agg({}, vec![{}])'.format('true' if self._flatten else 'false',
                                          ', '.join(elem.compiler_ctor() for elem in self._elems))
@ -210,8 +329,11 @@ class Aggregate(Type):
        return '({})'.format(', '.join(elem.rust_name() for elem in self._elems))

    def type_info(self, platform_info):
-        #return PlatformTypeInfo(None, None, self._llvm_name)
-        return None
+        return PlatformTypeInfo(None, None, [elem.type_info(platform_info) for elem in self._elems])
+
+    def __eq__(self, other):
+        return isinstance(other, Aggregate) and self._flatten == other._flatten and \
+            self._elems == other._elems


 TYPE_ID_LOOKUP = {'i': [Signed, Unsigned],
@ -219,6 +341,22 @@ TYPE_ID_LOOKUP = {'i': [Signed, Unsigned],
                  'u': [Unsigned],
                  'f': [Float]}

+def ptrify(match, elem, width, previous):
+    ptr = match.group('pointer')
+    if ptr is None:
+        return elem
+    else:
+        llvm_ptr = match.group('llvm_pointer')
+        if llvm_ptr is None:
+            llvm_elem = None
+        else:
+            assert llvm_ptr.startswith('/')
+            options = list(TypeSpec(llvm_ptr[1:]).enumerate(width, previous))
+            assert len(options) == 1
+            llvm_elem = options[0]
+        assert ptr in ('Pc', 'Pm')
+        return Pointer(elem, llvm_elem, ptr == 'Pc')
+
 class TypeSpec(object):
    def __init__(self, spec):
        if not isinstance(spec, list):
@ -226,71 +364,103 @@ class TypeSpec(object):

        self.spec = spec

-    def enumerate(self, width):
+    def enumerate(self, width, previous):
        for spec in self.spec:
            match = SPEC.match(spec)
-            if match:
+            if match is not None:
                id = match.group('id')
-                is_vector = id.islower()
-                type_ctors = TYPE_ID_LOOKUP[id.lower()]
+                reference = match.group('reference')

-                start = match.group('start')
-                if start is not None:
-                    end = match.group('end')
-                    llvm_width = None
+                modifiers = []
+                index = match.group('index')
+                if index is not None:
+                    modifiers.append(index)
+                modifiers += list(match.group('modifiers') or '')
+                force = match.group('force_width')
+                if force is not None:
+                    modifiers.append(force)
+                bitcast = match.group('bitcast')
+                if bitcast is not None:
+                    modifiers.append(bitcast)
+
+                if match.group('void') is not None:
+                    assert spec == 'V'
+                    yield Void()
+                elif id is not None:
+                    is_vector = id.islower()
+                    type_ctors = TYPE_ID_LOOKUP[id.lower()]
+
+                    start = match.group('start')
+                    if start is not None:
+                        end = match.group('end')
+                        llvm_width = None
+                    else:
+                        start = end = match.group('width')
+                        llvm_width = match.group('llvm_width')
+                    start = int(start)
+                    end = int(end)
+
+                    bitwidth = start
+                    while bitwidth <= end:
+                        for ctor in type_ctors:
+                            if llvm_width is not None:
+                                assert not is_vector
+                                llvm_width = int(llvm_width)
+                                assert llvm_width < bitwidth
+                                scalar = ctor(bitwidth, llvm_width)
+                            else:
+                                scalar = ctor(bitwidth)
+
+                            if is_vector:
+                                elem = Vector(scalar, width // bitwidth)
+                            else:
+                                assert bitcast is None
+                                elem = scalar
+
+                            for x in modifiers:
+                                elem = elem.modify(x, width, previous)
+                            yield ptrify(match, elem, width, previous)
+                        bitwidth *= 2
+                elif reference is not None:
+                    reference = int(reference)
+                    assert reference < len(previous), \
+                        'referring to argument {}, but only {} are known'.format(reference,
+                                                                                 len(previous))
+                    ret = previous[reference]
+                    for x in modifiers:
+                        ret = ret.modify(x, width, previous)
+                    yield ptrify(match, ret, width, previous)
                else:
-                    start = end = match.group('width')
-                    llvm_width = match.group('llvm_width')
-                start = int(start)
-                end = int(end)
+                    assert False, 'matched `{}`, but didn\'t understand it?'.format(spec)
+            elif spec.startswith('('):
+                if spec.endswith(')'):
+                    true_spec = spec[1:-1]
+                    flatten = False
+                elif spec.endswith(')f'):
+                    true_spec = spec[1:-2]
+                    flatten = True
+                else:
+                    assert False, 'found unclosed aggregate `{}`'.format(spec)

-                bitwidth = start
-                while bitwidth <= end:
-                    for ctor in type_ctors:
-                        if llvm_width is not None:
-                            assert not is_vector
-                            llvm_width = int(llvm_width)
-                            assert llvm_width < bitwidth
-                            scalar = ctor(bitwidth, llvm_width)
-                        else:
-                            scalar = ctor(bitwidth)
+                for elems in itertools.product(*(TypeSpec(subspec).enumerate(width, previous)
+                                                 for subspec in true_spec.split(','))):
+                    yield Aggregate(flatten, elems)
+            elif spec.startswith('['):
+                if spec.endswith(']'):
+                    true_spec = spec[1:-1]
+                    flatten = False
+                elif spec.endswith(']f'):
+                    true_spec = spec[1:-2]
+                    flatten = True
+                else:
+                    assert False, 'found unclosed aggregate `{}`'.format(spec)
+                elem_spec, count = true_spec.split(';')

-                        if is_vector:
-                            yield Vector(scalar, width // bitwidth)
-                        else:
-                            yield scalar
-                    bitwidth *= 2
+                count = int(count)
+                for elem in TypeSpec(elem_spec).enumerate(width, previous):
+                    yield Aggregate(flatten, [elem] * count)
            else:
-                print('Failed to parse: `{}`'.format(spec), file=sys.stderr)
-
-    def resolve(self, width, zero):
-        assert len(self.spec) == 1
-        spec = self.spec[0]
-        match = SPEC.match(spec)
-        if match:
-            id  = match.group('id')
-            if id is not None:
-                options = list(self.enumerate(width))
-                assert len(options) == 1
-                return options[0]
-            reference = match.group('reference')
-            if reference != '0':
-                raise NotImplementedError('only argument 0 (return value) references are supported')
-            ret = zero
-            for x in match.group('modifiers') or []:
-                ret = ret.modify(x, width)
-            force = match.group('force_width')
-            if force is not None:
-                ret = ret.modify(force, width)
-            return ret
-        elif spec.startswith('('):
-            if spec.endswith(')'):
-                raise NotImplementedError()
-            elif spec.endswith(')f'):
-                true_spec = spec[1:-2]
-                flatten = True
-            elems = [TypeSpec(subspec).resolve(width, zero) for subspec in true_spec.split(',')]
-            return Aggregate(flatten, elems)
+                assert False, 'Failed to parse `{}`'.format(spec)

 class GenericIntrinsic(object):
    def __init__(self, platform, intrinsic, widths, llvm_name, ret, args):
@ -305,10 +475,22 @@ class GenericIntrinsic(object):
        for width in self.widths:
            # must be a power of two
            assert width & (width - 1) == 0
-            for ret in self.ret.enumerate(width):
-                args = [arg.resolve(width, ret) for arg in self.args]
-                yield MonomorphicIntrinsic(self._platform, self.intrinsic, width, self.llvm_name,
-                                           ret, args)
+            def recur(processed, untouched):
+                if untouched == []:
+                    ret = processed[0]
+                    args = processed[1:]
+                    yield MonomorphicIntrinsic(self._platform, self.intrinsic, width,
+                                               self.llvm_name,
+                                               ret, args)
+                else:
+                    raw_arg = untouched[0]
+                    rest = untouched[1:]
+                    for arg in raw_arg.enumerate(width, processed):
+                        for intr in recur(processed + [arg], rest):
+                            yield intr
+
+            for x in recur([], [self.ret] + self.args):
+                yield x

 class MonomorphicIntrinsic(object):
    def __init__(self, platform, intrinsic, width, llvm_name, ret, args):
@ -369,7 +551,18 @@ def parse_args():
        ## Type specifier grammar

        ```
-        type := vector | scalar | aggregate | reference
+        type := core_type modifier* suffix?
+
+        core_type := void | vector | scalar | aggregate | reference
+
+        modifier := 'v' | 'S' | 'h' | 'd' | 'n' | 'w' | 'u' | 's' | 'f' |
+                     'D' | 'C' | 'M' | 'x' number | '.' number
+        suffix := pointer | bitcast
+        pointer := 'Pm' llvm_pointer? | 'Pc' llvm_pointer?
+        llvm_pointer := '/' type
+        bitcast := '->' type
+
+        void := 'V'

        vector := vector_elem width |
        vector_elem := 'i' | 'u' | 's' | 'f'
@ -378,18 +571,20 @@ def parse_args():
        scalar_type := 'U' | 'S' | 'F'
        llvm_width := '/' number

-        aggregate := '(' (type),* ')' 'f'?
-
-        reference := number modifiers*
-        modifiers := 'v' | 'h' | 'd' | 'n' | 'w' | 'u' | 's' |
-                     'x' number
+        aggregate := '(' (type),* ')' 'f'? | '[' type ';' number ']' 'f'?

+        reference := number

        width = number | '(' number '-' number ')'

        number = [0-9]+
        ```

+        ## Void
+
+        The `V` type corresponds to `void` in LLVM (`()` in
+        Rust). It's likely to only work in return position.
+
        ## Vectors

        The vector grammar is a pattern describing many possibilities
@ -433,6 +628,12 @@ def parse_args():
        - no `f` corresponds to `declare ... @llvm.foo({float, i32})`.
        - having an `f` corresponds to `declare ... @llvm.foo(float, i32)`.

+        The `[type;number]` form is just a shorter way to write
+        `(...)`, except that it avoids doing a cartesian product of
+        generic types, e.g. `[S32;2]` is the same as `(S32,S32)`, while
+        `[I32;2]` describes just the two types `(S32,S32)` and
+        `(U32,U32)` (i.e. doesn't include `(S32,U32)`, `(U32,S32)` as
+        `(I32,I32)` would).

        (Currently aggregates can not contain other aggregates.)

@ -441,19 +642,49 @@ def parse_args():
        A reference uses the type of another argument, with possible
        modifications. The number refers to the type to use, starting
        with 0 == return value, 1 == first argument, 2 == second
-        argument, etc. (Currently only referencing 0, the return
-        value, is supported.)
+        argument, etc.
+
+        ## Affixes
+
+        The `modifier` and `suffix` adaptors change the precise
+        representation.

        ### Modifiers

        - 'v': put a scalar into a vector of the current width (u32 -> u32x4, when width == 128)
+        - 'S': get the scalar element of a vector (u32x4 -> u32)
        - 'h': half the length of the vector (u32x4 -> u32x2)
        - 'd': double the length of the vector (u32x2 -> u32x4)
        - 'n': narrow the element of the vector (u32x4 -> u16x4)
        - 'w': widen the element of the vector (u16x4 -> u32x4)
-        - 'u': force an integer (vector or scalar) to be unsigned (i32x4 -> u32x4)
-        - 's': force an integer (vector or scalar) to be signed (u32x4 -> i32x4)
+        - 'u': force a number (vector or scalar) to be unsigned int (f32x4 -> u32x4)
+        - 's': force a number (vector or scalar) to be signed int (u32x4 -> i32x4)
+        - 'f': force a number (vector or scalar) to be float (u32x4 -> f32x4)
        - 'x' number: force the type to be a vector of bitwidth `number`.
+        - '.' number: get the `number`th element of an aggregate
+        - 'D': dereference a pointer (*mut u32 -> u32)
+        - 'C': make a pointer const (*mut u32 -> *const u32)
+        - 'M': make a pointer mut (*const u32 -> *mut u32)
+
+        ### Pointers
+
+        Pointers to any type can be created by appending a `Pm` or
+        `Pc` suffix; `m` vs. `c` chooses mut vs. const. E.g. `S32Pm`
+        corresponds to `*mut i32`, and `i32Pc` corresponds (with width
+        128) to `*const i32x4` and `*const u32x4`.
+
+        The (optional) type after the `/` represents the type used
+        internally by LLVM, e.g. `S32Pm/S8` is exposed as `*mut i32`
+        in Rust, but is `i8*` in LLVM. (This defaults to the main
+        type.)
+
+        ### Bitcast
+
+        The `'->' type` bitcast suffix will cause the value to be
+        bitcast to the right-hand type when calling the intrinsic,
+        e.g. `s32->f32` will expose the intrinsic as `i32x4` at the
+        Rust level, but will cast that vector to `f32x4` when calling
+        the LLVM intrinsic.
        '''))
    parser.add_argument('--format', choices=FORMATS, required=True,
                        help = 'Output format.')
@ -502,7 +733,7 @@ class CompilerDefs(object):

 #![allow(unused_imports)]

-use {{Intrinsic, i, i_, u, u_, f, v, agg}};
+use {{Intrinsic, i, i_, u, u_, f, v, v_, agg, p, void}};
 use IntrinsicDef::Named;
 use rustc::middle::ty;

--- a/src/etc/platform-intrinsics/x86/avx.json
+++ b/src/etc/platform-intrinsics/x86/avx.json
@ -36,6 +36,20 @@
            "ret": "f(32-64)",
            "args": ["0", "0"]
        },
+        {
+            "intrinsic": "{0.width_mm}_maskload_{0.data_type}",
+            "width": [128, 256],
+            "llvm": "maskload.{0.data_type_short}{0.width_suffix}",
+            "ret": ["f(32-64)"],
+            "args": ["0SPc/S8", "0s->0"]
+        },
+        {
+            "intrinsic": "{3.width_mm}_maskstore_{3.data_type}",
+            "width": [128, 256],
+            "llvm": "maskstore.{3.data_type_short}{3.width_suffix}",
+            "ret": "V",
+            "args": ["F(32-64)Pm/S8", "1Dsv->1Dv", "1Dv"]
+        },
        {
            "intrinsic": "256_min_{0.data_type}",
            "width": [256],
@ -78,6 +92,20 @@
            "ret": "f32",
            "args": ["f32"]
        },
+        {
+            "intrinsic": "256_storeu_{2.data_type}",
+            "width": [256],
+            "llvm": "storeu.ps.256",
+            "ret": "V",
+            "args": ["f(32-64)Pm/U8", "1D"]
+        },
+        {
+            "intrinsic": "256_storeu_si256",
+            "width": [256],
+            "llvm": "storeu.dq.256",
+            "ret": "V",
+            "args": ["u8Pm/U8", "1D"]
+        },
        {
            "intrinsic": "256_sqrt_{0.data_type}",
            "width": [256],
@ -147,6 +175,20 @@
            "llvm": "ptestz.256",
            "ret": "S32",
            "args": ["u64", "u64"]
+        },
+        {
+            "intrinsic": "256_zeroall",
+            "width": [256],
+            "llvm": "vzeroall",
+            "ret": "V",
+            "args": []
+        },
+        {
+            "intrinsic": "256_zeroupper",
+            "width": [256],
+            "llvm": "vzeroupper",
+            "ret": "V",
+            "args": []
        }
    ]
 }
--- a/src/etc/platform-intrinsics/x86/avx2.json
+++ b/src/etc/platform-intrinsics/x86/avx2.json
@ -4,21 +4,21 @@
        {
            "intrinsic": "256_abs_{0.data_type}",
            "width": [256],
-            "llvm": "avx2.pabs.{0.data_type_short}",
+            "llvm": "pabs.{0.data_type_short}",
            "ret": "s(8-32)",
            "args": ["0"]
        },
        {
            "intrinsic": "256_adds_{0.data_type}",
            "width": [256],
-            "llvm": "avx2.padd{0.kind_short}s.{0.data_type_short}",
+            "llvm": "padd{0.kind_short}s.{0.data_type_short}",
            "ret": "i(8-16)",
            "args": ["0", "0"]
        },
        {
            "intrinsic": "256_avg_{0.data_type}",
            "width": [256],
-            "llvm": "avx2.pavg.{0.data_type_short}",
+            "llvm": "pavg.{0.data_type_short}",
            "ret": "u(8-16)",
            "args": ["0", "0"]
        },
@ -64,6 +64,48 @@
            "ret": "s16",
            "args": ["s8", "s8"]
        },
+        {
+            "intrinsic": "{0.width_mm}_mask_i32gather_{0.data_type}",
+            "width": [128, 256],
+            "llvm": "gather.d.{0.data_type_short}{0.width_suffix}",
+            "ret": ["s32", "f32"],
+            "args": ["0", "0SPc/S8", "s32", "0s->0", "S32/8"]
+        },
+        {
+            "intrinsic": "{0.width_mm}_mask_i32gather_{0.data_type}",
+            "width": [128, 256],
+            "llvm": "gather.d.{0.data_type_short}{0.width_suffix}",
+            "ret": ["s64", "f64"],
+            "args": ["0", "0SPc/S8", "s32x128", "0s->0", "S32/8"]
+        },
+        {
+            "intrinsic": "{3.width_mm}_mask_i64gather_{0.data_type}",
+            "width": [128, 256],
+            "llvm": "gather.q.{0.data_type_short}{0.width_suffix}",
+            "ret": ["s32x128", "f32x128"],
+            "args": ["0", "0SPc/S8", "s64", "0s->0", "S32/8"]
+        },
+        {
+            "intrinsic": "{0.width_mm}_mask_i64gather_{0.data_type}",
+            "width": [128, 256],
+            "llvm": "gather.q.{0.data_type_short}{0.width_suffix}",
+            "ret": ["s64", "f64"],
+            "args": ["0", "0SPc/S8", "s64", "0s->0", "S32/8"]
+        },
+        {
+            "intrinsic": "{0.width_mm}_maskload_{0.data_type}",
+            "width": [128, 256],
+            "llvm": "maskload.{0.data_type_short}{0.width_suffix}",
+            "ret": ["s(32-64)"],
+            "args": ["0Pc/S8", "0"]
+        },
+        {
+            "intrinsic": "{2.width_mm}_maskstore_{2.data_type}",
+            "width": [128, 256],
+            "llvm": "maskstore.{2.data_type_short}{2.width_suffix}",
+            "ret": "V",
+            "args": ["S(32-64)Pm/S8", "1Dv", "2"]
+        },
        {
            "intrinsic": "256_max_{0.data_type}",
            "width": [256],
--- a/src/etc/platform-intrinsics/x86/sse.json
+++ b/src/etc/platform-intrinsics/x86/sse.json
@ -42,6 +42,13 @@
            "llvm": "!llvm.sqrt.v4f32",
            "ret": "f32",
            "args": ["0"]
+        },
+        {
+            "intrinsic": "_storeu_ps",
+            "width": [128],
+            "llvm": "storeu.ps",
+            "ret": "V",
+            "args": ["F32Pm/S8", "f32"]
        }
    ]
 }
--- a/src/etc/platform-intrinsics/x86/sse2.json
+++ b/src/etc/platform-intrinsics/x86/sse2.json
@ -15,6 +15,13 @@
            "ret": "u(8-16)",
            "args": ["0", "0"]
        },
+        {
+            "intrinsic": "_lfence",
+            "width": [128],
+            "llvm": "lfence",
+            "ret": "V",
+            "args": []
+        },
        {
            "intrinsic": "_madd_epi16",
            "width": [128],
@ -22,6 +29,13 @@
            "ret": "s32",
            "args": ["s16", "s16"]
        },
+        {
+            "intrinsic": "_maskmoveu_si128",
+            "width": [128],
+            "llvm": "maskmov.dqu",
+            "ret": "V",
+            "args": ["u8", "u8", "U8Pm"]
+        },
        {
            "intrinsic": "_max_{0.data_type}",
            "width": [128],
@ -36,6 +50,13 @@
            "ret": "f64",
            "args": ["0", "0"]
        },
+        {
+            "intrinsic": "_mfence",
+            "width": [128],
+            "llvm": "fence",
+            "ret": "V",
+            "args": []
+        },
        {
            "intrinsic": "_min_{0.data_type}",
            "width": [128],
@ -99,6 +120,13 @@
            "ret": "u64",
            "args": ["u8", "u8"]
        },
+        {
+            "intrinsic": "_sfence",
+            "width": [128],
+            "llvm": "sfence",
+            "ret": "V",
+            "args": []
+        },
        {
            "intrinsic": "_sqrt_pd",
            "width": [128],
@ -106,6 +134,20 @@
            "ret": "f64",
            "args": ["0"]
        },
+        {
+            "intrinsic": "_storeu_pd",
+            "width": [128],
+            "llvm": "storeu.pd",
+            "ret": "V",
+            "args": ["F64Pm/U8", "f64"]
+        },
+        {
+            "intrinsic": "_storeu_si128",
+            "width": [128],
+            "llvm": "storeu.dq",
+            "ret": "V",
+            "args": ["u8Pm/U8", "u8"]
+        },
        {
            "intrinsic": "_subs_{0.data_type}",
            "width": [128],
--- a/src/etc/platform-intrinsics/x86/sse3.json
+++ b/src/etc/platform-intrinsics/x86/sse3.json
@ -21,6 +21,13 @@
            "llvm": "hsub.{0.data_type}",
            "ret": "f(32-64)",
            "args": ["0", "0"]
+        },
+        {
+            "intrinsic": "_lddqu_si128",
+            "width": [128],
+            "llvm": "ldu.dq",
+            "ret": "u8",
+            "args": ["0Pc/S8"]
        }
    ]
 }
--- a/src/librustc_platform_intrinsics/aarch64.rs
+++ b/src/librustc_platform_intrinsics/aarch64.rs
@ -13,7 +13,7 @@

 #![allow(unused_imports)]

-use {Intrinsic, i, u, f, v, agg};
+use {Intrinsic, i, i_, u, u_, f, v, v_, agg, p, void};
 use IntrinsicDef::Named;
 use rustc::middle::ty;

@ -1910,6 +1910,606 @@ pub fn find<'tcx>(_tcx: &ty::ctxt<'tcx>, name: &str) -> Option<Intrinsic> {
            output: v(u(8), 16),
            definition: Named("llvm.aarch64.neon.rbit.v16i8")
        },
+        "ld2_s8" => Intrinsic {
+            inputs: vec![p(true, i(8), Some(v(i(8), 8)))],
+            output: agg(false, vec![v(i(8), 8), v(i(8), 8)]),
+            definition: Named("llvm.aarch64.neon.ld2.v8i8.p0v8i8")
+        },
+        "ld2_u8" => Intrinsic {
+            inputs: vec![p(true, u(8), Some(v(u(8), 8)))],
+            output: agg(false, vec![v(u(8), 8), v(u(8), 8)]),
+            definition: Named("llvm.aarch64.neon.ld2.v8i8.p0v8i8")
+        },
+        "ld2_s16" => Intrinsic {
+            inputs: vec![p(true, i(16), Some(v(i(16), 4)))],
+            output: agg(false, vec![v(i(16), 4), v(i(16), 4)]),
+            definition: Named("llvm.aarch64.neon.ld2.v4i16.p0v4i16")
+        },
+        "ld2_u16" => Intrinsic {
+            inputs: vec![p(true, u(16), Some(v(u(16), 4)))],
+            output: agg(false, vec![v(u(16), 4), v(u(16), 4)]),
+            definition: Named("llvm.aarch64.neon.ld2.v4i16.p0v4i16")
+        },
+        "ld2_s32" => Intrinsic {
+            inputs: vec![p(true, i(32), Some(v(i(32), 2)))],
+            output: agg(false, vec![v(i(32), 2), v(i(32), 2)]),
+            definition: Named("llvm.aarch64.neon.ld2.v2i32.p0v2i32")
+        },
+        "ld2_u32" => Intrinsic {
+            inputs: vec![p(true, u(32), Some(v(u(32), 2)))],
+            output: agg(false, vec![v(u(32), 2), v(u(32), 2)]),
+            definition: Named("llvm.aarch64.neon.ld2.v2i32.p0v2i32")
+        },
+        "ld2_s64" => Intrinsic {
+            inputs: vec![p(true, i(64), Some(v(i(64), 1)))],
+            output: agg(false, vec![v(i(64), 1), v(i(64), 1)]),
+            definition: Named("llvm.aarch64.neon.ld2.v1i64.p0v1i64")
+        },
+        "ld2_u64" => Intrinsic {
+            inputs: vec![p(true, u(64), Some(v(u(64), 1)))],
+            output: agg(false, vec![v(u(64), 1), v(u(64), 1)]),
+            definition: Named("llvm.aarch64.neon.ld2.v1i64.p0v1i64")
+        },
+        "ld2_f32" => Intrinsic {
+            inputs: vec![p(true, f(32), Some(v(f(32), 2)))],
+            output: agg(false, vec![v(f(32), 2), v(f(32), 2)]),
+            definition: Named("llvm.aarch64.neon.ld2.v2f32.p0v2f32")
+        },
+        "ld2_f64" => Intrinsic {
+            inputs: vec![p(true, f(64), Some(v(f(64), 1)))],
+            output: agg(false, vec![v(f(64), 1), v(f(64), 1)]),
+            definition: Named("llvm.aarch64.neon.ld2.v1f64.p0v1f64")
+        },
+        "ld2q_s8" => Intrinsic {
+            inputs: vec![p(true, i(8), Some(v(i(8), 16)))],
+            output: agg(false, vec![v(i(8), 16), v(i(8), 16)]),
+            definition: Named("llvm.aarch64.neon.ld2.v16i8.p0v16i8")
+        },
+        "ld2q_u8" => Intrinsic {
+            inputs: vec![p(true, u(8), Some(v(u(8), 16)))],
+            output: agg(false, vec![v(u(8), 16), v(u(8), 16)]),
+            definition: Named("llvm.aarch64.neon.ld2.v16i8.p0v16i8")
+        },
+        "ld2q_s16" => Intrinsic {
+            inputs: vec![p(true, i(16), Some(v(i(16), 8)))],
+            output: agg(false, vec![v(i(16), 8), v(i(16), 8)]),
+            definition: Named("llvm.aarch64.neon.ld2.v8i16.p0v8i16")
+        },
+        "ld2q_u16" => Intrinsic {
+            inputs: vec![p(true, u(16), Some(v(u(16), 8)))],
+            output: agg(false, vec![v(u(16), 8), v(u(16), 8)]),
+            definition: Named("llvm.aarch64.neon.ld2.v8i16.p0v8i16")
+        },
+        "ld2q_s32" => Intrinsic {
+            inputs: vec![p(true, i(32), Some(v(i(32), 4)))],
+            output: agg(false, vec![v(i(32), 4), v(i(32), 4)]),
+            definition: Named("llvm.aarch64.neon.ld2.v4i32.p0v4i32")
+        },
+        "ld2q_u32" => Intrinsic {
+            inputs: vec![p(true, u(32), Some(v(u(32), 4)))],
+            output: agg(false, vec![v(u(32), 4), v(u(32), 4)]),
+            definition: Named("llvm.aarch64.neon.ld2.v4i32.p0v4i32")
+        },
+        "ld2q_s64" => Intrinsic {
+            inputs: vec![p(true, i(64), Some(v(i(64), 2)))],
+            output: agg(false, vec![v(i(64), 2), v(i(64), 2)]),
+            definition: Named("llvm.aarch64.neon.ld2.v2i64.p0v2i64")
+        },
+        "ld2q_u64" => Intrinsic {
+            inputs: vec![p(true, u(64), Some(v(u(64), 2)))],
+            output: agg(false, vec![v(u(64), 2), v(u(64), 2)]),
+            definition: Named("llvm.aarch64.neon.ld2.v2i64.p0v2i64")
+        },
+        "ld2q_f32" => Intrinsic {
+            inputs: vec![p(true, f(32), Some(v(f(32), 4)))],
+            output: agg(false, vec![v(f(32), 4), v(f(32), 4)]),
+            definition: Named("llvm.aarch64.neon.ld2.v4f32.p0v4f32")
+        },
+        "ld2q_f64" => Intrinsic {
+            inputs: vec![p(true, f(64), Some(v(f(64), 2)))],
+            output: agg(false, vec![v(f(64), 2), v(f(64), 2)]),
+            definition: Named("llvm.aarch64.neon.ld2.v2f64.p0v2f64")
+        },
+        "ld3_s8" => Intrinsic {
+            inputs: vec![p(true, i(8), Some(v(i(8), 8)))],
+            output: agg(false, vec![v(i(8), 8), v(i(8), 8), v(i(8), 8)]),
+            definition: Named("llvm.aarch64.neon.ld3.v8i8.p0v8i8")
+        },
+        "ld3_u8" => Intrinsic {
+            inputs: vec![p(true, u(8), Some(v(u(8), 8)))],
+            output: agg(false, vec![v(u(8), 8), v(u(8), 8), v(u(8), 8)]),
+            definition: Named("llvm.aarch64.neon.ld3.v8i8.p0v8i8")
+        },
+        "ld3_s16" => Intrinsic {
+            inputs: vec![p(true, i(16), Some(v(i(16), 4)))],
+            output: agg(false, vec![v(i(16), 4), v(i(16), 4), v(i(16), 4)]),
+            definition: Named("llvm.aarch64.neon.ld3.v4i16.p0v4i16")
+        },
+        "ld3_u16" => Intrinsic {
+            inputs: vec![p(true, u(16), Some(v(u(16), 4)))],
+            output: agg(false, vec![v(u(16), 4), v(u(16), 4), v(u(16), 4)]),
+            definition: Named("llvm.aarch64.neon.ld3.v4i16.p0v4i16")
+        },
+        "ld3_s32" => Intrinsic {
+            inputs: vec![p(true, i(32), Some(v(i(32), 2)))],
+            output: agg(false, vec![v(i(32), 2), v(i(32), 2), v(i(32), 2)]),
+            definition: Named("llvm.aarch64.neon.ld3.v2i32.p0v2i32")
+        },
+        "ld3_u32" => Intrinsic {
+            inputs: vec![p(true, u(32), Some(v(u(32), 2)))],
+            output: agg(false, vec![v(u(32), 2), v(u(32), 2), v(u(32), 2)]),
+            definition: Named("llvm.aarch64.neon.ld3.v2i32.p0v2i32")
+        },
+        "ld3_s64" => Intrinsic {
+            inputs: vec![p(true, i(64), Some(v(i(64), 1)))],
+            output: agg(false, vec![v(i(64), 1), v(i(64), 1), v(i(64), 1)]),
+            definition: Named("llvm.aarch64.neon.ld3.v1i64.p0v1i64")
+        },
+        "ld3_u64" => Intrinsic {
+            inputs: vec![p(true, u(64), Some(v(u(64), 1)))],
+            output: agg(false, vec![v(u(64), 1), v(u(64), 1), v(u(64), 1)]),
+            definition: Named("llvm.aarch64.neon.ld3.v1i64.p0v1i64")
+        },
+        "ld3_f32" => Intrinsic {
+            inputs: vec![p(true, f(32), Some(v(f(32), 2)))],
+            output: agg(false, vec![v(f(32), 2), v(f(32), 2), v(f(32), 2)]),
+            definition: Named("llvm.aarch64.neon.ld3.v2f32.p0v2f32")
+        },
+        "ld3_f64" => Intrinsic {
+            inputs: vec![p(true, f(64), Some(v(f(64), 1)))],
+            output: agg(false, vec![v(f(64), 1), v(f(64), 1), v(f(64), 1)]),
+            definition: Named("llvm.aarch64.neon.ld3.v1f64.p0v1f64")
+        },
+        "ld3q_s8" => Intrinsic {
+            inputs: vec![p(true, i(8), Some(v(i(8), 16)))],
+            output: agg(false, vec![v(i(8), 16), v(i(8), 16), v(i(8), 16)]),
+            definition: Named("llvm.aarch64.neon.ld3.v16i8.p0v16i8")
+        },
+        "ld3q_u8" => Intrinsic {
+            inputs: vec![p(true, u(8), Some(v(u(8), 16)))],
+            output: agg(false, vec![v(u(8), 16), v(u(8), 16), v(u(8), 16)]),
+            definition: Named("llvm.aarch64.neon.ld3.v16i8.p0v16i8")
+        },
+        "ld3q_s16" => Intrinsic {
+            inputs: vec![p(true, i(16), Some(v(i(16), 8)))],
+            output: agg(false, vec![v(i(16), 8), v(i(16), 8), v(i(16), 8)]),
+            definition: Named("llvm.aarch64.neon.ld3.v8i16.p0v8i16")
+        },
+        "ld3q_u16" => Intrinsic {
+            inputs: vec![p(true, u(16), Some(v(u(16), 8)))],
+            output: agg(false, vec![v(u(16), 8), v(u(16), 8), v(u(16), 8)]),
+            definition: Named("llvm.aarch64.neon.ld3.v8i16.p0v8i16")
+        },
+        "ld3q_s32" => Intrinsic {
+            inputs: vec![p(true, i(32), Some(v(i(32), 4)))],
+            output: agg(false, vec![v(i(32), 4), v(i(32), 4), v(i(32), 4)]),
+            definition: Named("llvm.aarch64.neon.ld3.v4i32.p0v4i32")
+        },
+        "ld3q_u32" => Intrinsic {
+            inputs: vec![p(true, u(32), Some(v(u(32), 4)))],
+            output: agg(false, vec![v(u(32), 4), v(u(32), 4), v(u(32), 4)]),
+            definition: Named("llvm.aarch64.neon.ld3.v4i32.p0v4i32")
+        },
+        "ld3q_s64" => Intrinsic {
+            inputs: vec![p(true, i(64), Some(v(i(64), 2)))],
+            output: agg(false, vec![v(i(64), 2), v(i(64), 2), v(i(64), 2)]),
+            definition: Named("llvm.aarch64.neon.ld3.v2i64.p0v2i64")
+        },
+        "ld3q_u64" => Intrinsic {
+            inputs: vec![p(true, u(64), Some(v(u(64), 2)))],
+            output: agg(false, vec![v(u(64), 2), v(u(64), 2), v(u(64), 2)]),
+            definition: Named("llvm.aarch64.neon.ld3.v2i64.p0v2i64")
+        },
+        "ld3q_f32" => Intrinsic {
+            inputs: vec![p(true, f(32), Some(v(f(32), 4)))],
+            output: agg(false, vec![v(f(32), 4), v(f(32), 4), v(f(32), 4)]),
+            definition: Named("llvm.aarch64.neon.ld3.v4f32.p0v4f32")
+        },
+        "ld3q_f64" => Intrinsic {
+            inputs: vec![p(true, f(64), Some(v(f(64), 2)))],
+            output: agg(false, vec![v(f(64), 2), v(f(64), 2), v(f(64), 2)]),
+            definition: Named("llvm.aarch64.neon.ld3.v2f64.p0v2f64")
+        },
+        "ld4_s8" => Intrinsic {
+            inputs: vec![p(true, i(8), Some(v(i(8), 8)))],
+            output: agg(false, vec![v(i(8), 8), v(i(8), 8), v(i(8), 8), v(i(8), 8)]),
+            definition: Named("llvm.aarch64.neon.ld4.v8i8.p0v8i8")
+        },
+        "ld4_u8" => Intrinsic {
+            inputs: vec![p(true, u(8), Some(v(u(8), 8)))],
+            output: agg(false, vec![v(u(8), 8), v(u(8), 8), v(u(8), 8), v(u(8), 8)]),
+            definition: Named("llvm.aarch64.neon.ld4.v8i8.p0v8i8")
+        },
+        "ld4_s16" => Intrinsic {
+            inputs: vec![p(true, i(16), Some(v(i(16), 4)))],
+            output: agg(false, vec![v(i(16), 4), v(i(16), 4), v(i(16), 4), v(i(16), 4)]),
+            definition: Named("llvm.aarch64.neon.ld4.v4i16.p0v4i16")
+        },
+        "ld4_u16" => Intrinsic {
+            inputs: vec![p(true, u(16), Some(v(u(16), 4)))],
+            output: agg(false, vec![v(u(16), 4), v(u(16), 4), v(u(16), 4), v(u(16), 4)]),
+            definition: Named("llvm.aarch64.neon.ld4.v4i16.p0v4i16")
+        },
+        "ld4_s32" => Intrinsic {
+            inputs: vec![p(true, i(32), Some(v(i(32), 2)))],
+            output: agg(false, vec![v(i(32), 2), v(i(32), 2), v(i(32), 2), v(i(32), 2)]),
+            definition: Named("llvm.aarch64.neon.ld4.v2i32.p0v2i32")
+        },
+        "ld4_u32" => Intrinsic {
+            inputs: vec![p(true, u(32), Some(v(u(32), 2)))],
+            output: agg(false, vec![v(u(32), 2), v(u(32), 2), v(u(32), 2), v(u(32), 2)]),
+            definition: Named("llvm.aarch64.neon.ld4.v2i32.p0v2i32")
+        },
+        "ld4_s64" => Intrinsic {
+            inputs: vec![p(true, i(64), Some(v(i(64), 1)))],
+            output: agg(false, vec![v(i(64), 1), v(i(64), 1), v(i(64), 1), v(i(64), 1)]),
+            definition: Named("llvm.aarch64.neon.ld4.v1i64.p0v1i64")
+        },
+        "ld4_u64" => Intrinsic {
+            inputs: vec![p(true, u(64), Some(v(u(64), 1)))],
+            output: agg(false, vec![v(u(64), 1), v(u(64), 1), v(u(64), 1), v(u(64), 1)]),
+            definition: Named("llvm.aarch64.neon.ld4.v1i64.p0v1i64")
+        },
+        "ld4_f32" => Intrinsic {
+            inputs: vec![p(true, f(32), Some(v(f(32), 2)))],
+            output: agg(false, vec![v(f(32), 2), v(f(32), 2), v(f(32), 2), v(f(32), 2)]),
+            definition: Named("llvm.aarch64.neon.ld4.v2f32.p0v2f32")
+        },
+        "ld4_f64" => Intrinsic {
+            inputs: vec![p(true, f(64), Some(v(f(64), 1)))],
+            output: agg(false, vec![v(f(64), 1), v(f(64), 1), v(f(64), 1), v(f(64), 1)]),
+            definition: Named("llvm.aarch64.neon.ld4.v1f64.p0v1f64")
+        },
+        "ld4q_s8" => Intrinsic {
+            inputs: vec![p(true, i(8), Some(v(i(8), 16)))],
+            output: agg(false, vec![v(i(8), 16), v(i(8), 16), v(i(8), 16), v(i(8), 16)]),
+            definition: Named("llvm.aarch64.neon.ld4.v16i8.p0v16i8")
+        },
+        "ld4q_u8" => Intrinsic {
+            inputs: vec![p(true, u(8), Some(v(u(8), 16)))],
+            output: agg(false, vec![v(u(8), 16), v(u(8), 16), v(u(8), 16), v(u(8), 16)]),
+            definition: Named("llvm.aarch64.neon.ld4.v16i8.p0v16i8")
+        },
+        "ld4q_s16" => Intrinsic {
+            inputs: vec![p(true, i(16), Some(v(i(16), 8)))],
+            output: agg(false, vec![v(i(16), 8), v(i(16), 8), v(i(16), 8), v(i(16), 8)]),
+            definition: Named("llvm.aarch64.neon.ld4.v8i16.p0v8i16")
+        },
+        "ld4q_u16" => Intrinsic {
+            inputs: vec![p(true, u(16), Some(v(u(16), 8)))],
+            output: agg(false, vec![v(u(16), 8), v(u(16), 8), v(u(16), 8), v(u(16), 8)]),
+            definition: Named("llvm.aarch64.neon.ld4.v8i16.p0v8i16")
+        },
+        "ld4q_s32" => Intrinsic {
+            inputs: vec![p(true, i(32), Some(v(i(32), 4)))],
+            output: agg(false, vec![v(i(32), 4), v(i(32), 4), v(i(32), 4), v(i(32), 4)]),
+            definition: Named("llvm.aarch64.neon.ld4.v4i32.p0v4i32")
+        },
+        "ld4q_u32" => Intrinsic {
+            inputs: vec![p(true, u(32), Some(v(u(32), 4)))],
+            output: agg(false, vec![v(u(32), 4), v(u(32), 4), v(u(32), 4), v(u(32), 4)]),
+            definition: Named("llvm.aarch64.neon.ld4.v4i32.p0v4i32")
+        },
+        "ld4q_s64" => Intrinsic {
+            inputs: vec![p(true, i(64), Some(v(i(64), 2)))],
+            output: agg(false, vec![v(i(64), 2), v(i(64), 2), v(i(64), 2), v(i(64), 2)]),
+            definition: Named("llvm.aarch64.neon.ld4.v2i64.p0v2i64")
+        },
+        "ld4q_u64" => Intrinsic {
+            inputs: vec![p(true, u(64), Some(v(u(64), 2)))],
+            output: agg(false, vec![v(u(64), 2), v(u(64), 2), v(u(64), 2), v(u(64), 2)]),
+            definition: Named("llvm.aarch64.neon.ld4.v2i64.p0v2i64")
+        },
+        "ld4q_f32" => Intrinsic {
+            inputs: vec![p(true, f(32), Some(v(f(32), 4)))],
+            output: agg(false, vec![v(f(32), 4), v(f(32), 4), v(f(32), 4), v(f(32), 4)]),
+            definition: Named("llvm.aarch64.neon.ld4.v4f32.p0v4f32")
+        },
+        "ld4q_f64" => Intrinsic {
+            inputs: vec![p(true, f(64), Some(v(f(64), 2)))],
+            output: agg(false, vec![v(f(64), 2), v(f(64), 2), v(f(64), 2), v(f(64), 2)]),
+            definition: Named("llvm.aarch64.neon.ld4.v2f64.p0v2f64")
+        },
+        "ld2_dup_s8" => Intrinsic {
+            inputs: vec![p(true, i(8), None)],
+            output: agg(false, vec![v(i(8), 8), v(i(8), 8)]),
+            definition: Named("llvm.aarch64.neon.ld2.v8i8.p0i8")
+        },
+        "ld2_dup_u8" => Intrinsic {
+            inputs: vec![p(true, u(8), None)],
+            output: agg(false, vec![v(u(8), 8), v(u(8), 8)]),
+            definition: Named("llvm.aarch64.neon.ld2.v8i8.p0i8")
+        },
+        "ld2_dup_s16" => Intrinsic {
+            inputs: vec![p(true, i(16), None)],
+            output: agg(false, vec![v(i(16), 4), v(i(16), 4)]),
+            definition: Named("llvm.aarch64.neon.ld2.v4i16.p0i16")
+        },
+        "ld2_dup_u16" => Intrinsic {
+            inputs: vec![p(true, u(16), None)],
+            output: agg(false, vec![v(u(16), 4), v(u(16), 4)]),
+            definition: Named("llvm.aarch64.neon.ld2.v4i16.p0i16")
+        },
+        "ld2_dup_s32" => Intrinsic {
+            inputs: vec![p(true, i(32), None)],
+            output: agg(false, vec![v(i(32), 2), v(i(32), 2)]),
+            definition: Named("llvm.aarch64.neon.ld2.v2i32.p0i32")
+        },
+        "ld2_dup_u32" => Intrinsic {
+            inputs: vec![p(true, u(32), None)],
+            output: agg(false, vec![v(u(32), 2), v(u(32), 2)]),
+            definition: Named("llvm.aarch64.neon.ld2.v2i32.p0i32")
+        },
+        "ld2_dup_s64" => Intrinsic {
+            inputs: vec![p(true, i(64), None)],
+            output: agg(false, vec![v(i(64), 1), v(i(64), 1)]),
+            definition: Named("llvm.aarch64.neon.ld2.v1i64.p0i64")
+        },
+        "ld2_dup_u64" => Intrinsic {
+            inputs: vec![p(true, u(64), None)],
+            output: agg(false, vec![v(u(64), 1), v(u(64), 1)]),
+            definition: Named("llvm.aarch64.neon.ld2.v1i64.p0i64")
+        },
+        "ld2_dup_f32" => Intrinsic {
+            inputs: vec![p(true, f(32), None)],
+            output: agg(false, vec![v(f(32), 2), v(f(32), 2)]),
+            definition: Named("llvm.aarch64.neon.ld2.v2f32.p0f32")
+        },
+        "ld2_dup_f64" => Intrinsic {
+            inputs: vec![p(true, f(64), None)],
+            output: agg(false, vec![v(f(64), 1), v(f(64), 1)]),
+            definition: Named("llvm.aarch64.neon.ld2.v1f64.p0f64")
+        },
+        "ld2q_dup_s8" => Intrinsic {
+            inputs: vec![p(true, i(8), None)],
+            output: agg(false, vec![v(i(8), 16), v(i(8), 16)]),
+            definition: Named("llvm.aarch64.neon.ld2.v16i8.p0i8")
+        },
+        "ld2q_dup_u8" => Intrinsic {
+            inputs: vec![p(true, u(8), None)],
+            output: agg(false, vec![v(u(8), 16), v(u(8), 16)]),
+            definition: Named("llvm.aarch64.neon.ld2.v16i8.p0i8")
+        },
+        "ld2q_dup_s16" => Intrinsic {
+            inputs: vec![p(true, i(16), None)],
+            output: agg(false, vec![v(i(16), 8), v(i(16), 8)]),
+            definition: Named("llvm.aarch64.neon.ld2.v8i16.p0i16")
+        },
+        "ld2q_dup_u16" => Intrinsic {
+            inputs: vec![p(true, u(16), None)],
+            output: agg(false, vec![v(u(16), 8), v(u(16), 8)]),
+            definition: Named("llvm.aarch64.neon.ld2.v8i16.p0i16")
+        },
+        "ld2q_dup_s32" => Intrinsic {
+            inputs: vec![p(true, i(32), None)],
+            output: agg(false, vec![v(i(32), 4), v(i(32), 4)]),
+            definition: Named("llvm.aarch64.neon.ld2.v4i32.p0i32")
+        },
+        "ld2q_dup_u32" => Intrinsic {
+            inputs: vec![p(true, u(32), None)],
+            output: agg(false, vec![v(u(32), 4), v(u(32), 4)]),
+            definition: Named("llvm.aarch64.neon.ld2.v4i32.p0i32")
+        },
+        "ld2q_dup_s64" => Intrinsic {
+            inputs: vec![p(true, i(64), None)],
+            output: agg(false, vec![v(i(64), 2), v(i(64), 2)]),
+            definition: Named("llvm.aarch64.neon.ld2.v2i64.p0i64")
+        },
+        "ld2q_dup_u64" => Intrinsic {
+            inputs: vec![p(true, u(64), None)],
+            output: agg(false, vec![v(u(64), 2), v(u(64), 2)]),
+            definition: Named("llvm.aarch64.neon.ld2.v2i64.p0i64")
+        },
+        "ld2q_dup_f32" => Intrinsic {
+            inputs: vec![p(true, f(32), None)],
+            output: agg(false, vec![v(f(32), 4), v(f(32), 4)]),
+            definition: Named("llvm.aarch64.neon.ld2.v4f32.p0f32")
+        },
+        "ld2q_dup_f64" => Intrinsic {
+            inputs: vec![p(true, f(64), None)],
+            output: agg(false, vec![v(f(64), 2), v(f(64), 2)]),
+            definition: Named("llvm.aarch64.neon.ld2.v2f64.p0f64")
+        },
+        "ld3_dup_s8" => Intrinsic {
+            inputs: vec![p(true, i(8), None)],
+            output: agg(false, vec![v(i(8), 8), v(i(8), 8), v(i(8), 8)]),
+            definition: Named("llvm.aarch64.neon.ld3.v8i8.p0i8")
+        },
+        "ld3_dup_u8" => Intrinsic {
+            inputs: vec![p(true, u(8), None)],
+            output: agg(false, vec![v(u(8), 8), v(u(8), 8), v(u(8), 8)]),
+            definition: Named("llvm.aarch64.neon.ld3.v8i8.p0i8")
+        },
+        "ld3_dup_s16" => Intrinsic {
+            inputs: vec![p(true, i(16), None)],
+            output: agg(false, vec![v(i(16), 4), v(i(16), 4), v(i(16), 4)]),
+            definition: Named("llvm.aarch64.neon.ld3.v4i16.p0i16")
+        },
+        "ld3_dup_u16" => Intrinsic {
+            inputs: vec![p(true, u(16), None)],
+            output: agg(false, vec![v(u(16), 4), v(u(16), 4), v(u(16), 4)]),
+            definition: Named("llvm.aarch64.neon.ld3.v4i16.p0i16")
+        },
+        "ld3_dup_s32" => Intrinsic {
+            inputs: vec![p(true, i(32), None)],
+            output: agg(false, vec![v(i(32), 2), v(i(32), 2), v(i(32), 2)]),
+            definition: Named("llvm.aarch64.neon.ld3.v2i32.p0i32")
+        },
+        "ld3_dup_u32" => Intrinsic {
+            inputs: vec![p(true, u(32), None)],
+            output: agg(false, vec![v(u(32), 2), v(u(32), 2), v(u(32), 2)]),
+            definition: Named("llvm.aarch64.neon.ld3.v2i32.p0i32")
+        },
+        "ld3_dup_s64" => Intrinsic {
+            inputs: vec![p(true, i(64), None)],
+            output: agg(false, vec![v(i(64), 1), v(i(64), 1), v(i(64), 1)]),
+            definition: Named("llvm.aarch64.neon.ld3.v1i64.p0i64")
+        },
+        "ld3_dup_u64" => Intrinsic {
+            inputs: vec![p(true, u(64), None)],
+            output: agg(false, vec![v(u(64), 1), v(u(64), 1), v(u(64), 1)]),
+            definition: Named("llvm.aarch64.neon.ld3.v1i64.p0i64")
+        },
+        "ld3_dup_f32" => Intrinsic {
+            inputs: vec![p(true, f(32), None)],
+            output: agg(false, vec![v(f(32), 2), v(f(32), 2), v(f(32), 2)]),
+            definition: Named("llvm.aarch64.neon.ld3.v2f32.p0f32")
+        },
+        "ld3_dup_f64" => Intrinsic {
+            inputs: vec![p(true, f(64), None)],
+            output: agg(false, vec![v(f(64), 1), v(f(64), 1), v(f(64), 1)]),
+            definition: Named("llvm.aarch64.neon.ld3.v1f64.p0f64")
+        },
+        "ld3q_dup_s8" => Intrinsic {
+            inputs: vec![p(true, i(8), None)],
+            output: agg(false, vec![v(i(8), 16), v(i(8), 16), v(i(8), 16)]),
+            definition: Named("llvm.aarch64.neon.ld3.v16i8.p0i8")
+        },
+        "ld3q_dup_u8" => Intrinsic {
+            inputs: vec![p(true, u(8), None)],
+            output: agg(false, vec![v(u(8), 16), v(u(8), 16), v(u(8), 16)]),
+            definition: Named("llvm.aarch64.neon.ld3.v16i8.p0i8")
+        },
+        "ld3q_dup_s16" => Intrinsic {
+            inputs: vec![p(true, i(16), None)],
+            output: agg(false, vec![v(i(16), 8), v(i(16), 8), v(i(16), 8)]),
+            definition: Named("llvm.aarch64.neon.ld3.v8i16.p0i16")
+        },
+        "ld3q_dup_u16" => Intrinsic {
+            inputs: vec![p(true, u(16), None)],
+            output: agg(false, vec![v(u(16), 8), v(u(16), 8), v(u(16), 8)]),
+            definition: Named("llvm.aarch64.neon.ld3.v8i16.p0i16")
+        },
+        "ld3q_dup_s32" => Intrinsic {
+            inputs: vec![p(true, i(32), None)],
+            output: agg(false, vec![v(i(32), 4), v(i(32), 4), v(i(32), 4)]),
+            definition: Named("llvm.aarch64.neon.ld3.v4i32.p0i32")
+        },
+        "ld3q_dup_u32" => Intrinsic {
+            inputs: vec![p(true, u(32), None)],
+            output: agg(false, vec![v(u(32), 4), v(u(32), 4), v(u(32), 4)]),
+            definition: Named("llvm.aarch64.neon.ld3.v4i32.p0i32")
+        },
+        "ld3q_dup_s64" => Intrinsic {
+            inputs: vec![p(true, i(64), None)],
+            output: agg(false, vec![v(i(64), 2), v(i(64), 2), v(i(64), 2)]),
+            definition: Named("llvm.aarch64.neon.ld3.v2i64.p0i64")
+        },
+        "ld3q_dup_u64" => Intrinsic {
+            inputs: vec![p(true, u(64), None)],
+            output: agg(false, vec![v(u(64), 2), v(u(64), 2), v(u(64), 2)]),
+            definition: Named("llvm.aarch64.neon.ld3.v2i64.p0i64")
+        },
+        "ld3q_dup_f32" => Intrinsic {
+            inputs: vec![p(true, f(32), None)],
+            output: agg(false, vec![v(f(32), 4), v(f(32), 4), v(f(32), 4)]),
+            definition: Named("llvm.aarch64.neon.ld3.v4f32.p0f32")
+        },
+        "ld3q_dup_f64" => Intrinsic {
+            inputs: vec![p(true, f(64), None)],
+            output: agg(false, vec![v(f(64), 2), v(f(64), 2), v(f(64), 2)]),
+            definition: Named("llvm.aarch64.neon.ld3.v2f64.p0f64")
+        },
+        "ld4_dup_s8" => Intrinsic {
+            inputs: vec![p(true, i(8), None)],
+            output: agg(false, vec![v(i(8), 8), v(i(8), 8), v(i(8), 8), v(i(8), 8)]),
+            definition: Named("llvm.aarch64.neon.ld4.v8i8.p0i8")
+        },
+        "ld4_dup_u8" => Intrinsic {
+            inputs: vec![p(true, u(8), None)],
+            output: agg(false, vec![v(u(8), 8), v(u(8), 8), v(u(8), 8), v(u(8), 8)]),
+            definition: Named("llvm.aarch64.neon.ld4.v8i8.p0i8")
+        },
+        "ld4_dup_s16" => Intrinsic {
+            inputs: vec![p(true, i(16), None)],
+            output: agg(false, vec![v(i(16), 4), v(i(16), 4), v(i(16), 4), v(i(16), 4)]),
+            definition: Named("llvm.aarch64.neon.ld4.v4i16.p0i16")
+        },
+        "ld4_dup_u16" => Intrinsic {
+            inputs: vec![p(true, u(16), None)],
+            output: agg(false, vec![v(u(16), 4), v(u(16), 4), v(u(16), 4), v(u(16), 4)]),
+            definition: Named("llvm.aarch64.neon.ld4.v4i16.p0i16")
+        },
+        "ld4_dup_s32" => Intrinsic {
+            inputs: vec![p(true, i(32), None)],
+            output: agg(false, vec![v(i(32), 2), v(i(32), 2), v(i(32), 2), v(i(32), 2)]),
+            definition: Named("llvm.aarch64.neon.ld4.v2i32.p0i32")
+        },
+        "ld4_dup_u32" => Intrinsic {
+            inputs: vec![p(true, u(32), None)],
+            output: agg(false, vec![v(u(32), 2), v(u(32), 2), v(u(32), 2), v(u(32), 2)]),
+            definition: Named("llvm.aarch64.neon.ld4.v2i32.p0i32")
+        },
+        "ld4_dup_s64" => Intrinsic {
+            inputs: vec![p(true, i(64), None)],
+            output: agg(false, vec![v(i(64), 1), v(i(64), 1), v(i(64), 1), v(i(64), 1)]),
+            definition: Named("llvm.aarch64.neon.ld4.v1i64.p0i64")
+        },
+        "ld4_dup_u64" => Intrinsic {
+            inputs: vec![p(true, u(64), None)],
+            output: agg(false, vec![v(u(64), 1), v(u(64), 1), v(u(64), 1), v(u(64), 1)]),
+            definition: Named("llvm.aarch64.neon.ld4.v1i64.p0i64")
+        },
+        "ld4_dup_f32" => Intrinsic {
+            inputs: vec![p(true, f(32), None)],
+            output: agg(false, vec![v(f(32), 2), v(f(32), 2), v(f(32), 2), v(f(32), 2)]),
+            definition: Named("llvm.aarch64.neon.ld4.v2f32.p0f32")
+        },
+        "ld4_dup_f64" => Intrinsic {
+            inputs: vec![p(true, f(64), None)],
+            output: agg(false, vec![v(f(64), 1), v(f(64), 1), v(f(64), 1), v(f(64), 1)]),
+            definition: Named("llvm.aarch64.neon.ld4.v1f64.p0f64")
+        },
+        "ld4q_dup_s8" => Intrinsic {
+            inputs: vec![p(true, i(8), None)],
+            output: agg(false, vec![v(i(8), 16), v(i(8), 16), v(i(8), 16), v(i(8), 16)]),
+            definition: Named("llvm.aarch64.neon.ld4.v16i8.p0i8")
+        },
+        "ld4q_dup_u8" => Intrinsic {
+            inputs: vec![p(true, u(8), None)],
+            output: agg(false, vec![v(u(8), 16), v(u(8), 16), v(u(8), 16), v(u(8), 16)]),
+            definition: Named("llvm.aarch64.neon.ld4.v16i8.p0i8")
+        },
+        "ld4q_dup_s16" => Intrinsic {
+            inputs: vec![p(true, i(16), None)],
+            output: agg(false, vec![v(i(16), 8), v(i(16), 8), v(i(16), 8), v(i(16), 8)]),
+            definition: Named("llvm.aarch64.neon.ld4.v8i16.p0i16")
+        },
+        "ld4q_dup_u16" => Intrinsic {
+            inputs: vec![p(true, u(16), None)],
+            output: agg(false, vec![v(u(16), 8), v(u(16), 8), v(u(16), 8), v(u(16), 8)]),
+            definition: Named("llvm.aarch64.neon.ld4.v8i16.p0i16")
+        },
+        "ld4q_dup_s32" => Intrinsic {
+            inputs: vec![p(true, i(32), None)],
+            output: agg(false, vec![v(i(32), 4), v(i(32), 4), v(i(32), 4), v(i(32), 4)]),
+            definition: Named("llvm.aarch64.neon.ld4.v4i32.p0i32")
+        },
+        "ld4q_dup_u32" => Intrinsic {
+            inputs: vec![p(true, u(32), None)],
+            output: agg(false, vec![v(u(32), 4), v(u(32), 4), v(u(32), 4), v(u(32), 4)]),
+            definition: Named("llvm.aarch64.neon.ld4.v4i32.p0i32")
+        },
+        "ld4q_dup_s64" => Intrinsic {
+            inputs: vec![p(true, i(64), None)],
+            output: agg(false, vec![v(i(64), 2), v(i(64), 2), v(i(64), 2), v(i(64), 2)]),
+            definition: Named("llvm.aarch64.neon.ld4.v2i64.p0i64")
+        },
+        "ld4q_dup_u64" => Intrinsic {
+            inputs: vec![p(true, u(64), None)],
+            output: agg(false, vec![v(u(64), 2), v(u(64), 2), v(u(64), 2), v(u(64), 2)]),
+            definition: Named("llvm.aarch64.neon.ld4.v2i64.p0i64")
+        },
+        "ld4q_dup_f32" => Intrinsic {
+            inputs: vec![p(true, f(32), None)],
+            output: agg(false, vec![v(f(32), 4), v(f(32), 4), v(f(32), 4), v(f(32), 4)]),
+            definition: Named("llvm.aarch64.neon.ld4.v4f32.p0f32")
+        },
+        "ld4q_dup_f64" => Intrinsic {
+            inputs: vec![p(true, f(64), None)],
+            output: agg(false, vec![v(f(64), 2), v(f(64), 2), v(f(64), 2), v(f(64), 2)]),
+            definition: Named("llvm.aarch64.neon.ld4.v2f64.p0f64")
+        },
        "padd_s8" => Intrinsic {
            inputs: vec![v(i(8), 8), v(i(8), 8)],
            output: v(i(8), 8),
--- a/src/librustc_platform_intrinsics/arm.rs
+++ b/src/librustc_platform_intrinsics/arm.rs
@ -13,7 +13,7 @@

 #![allow(unused_imports)]

-use {Intrinsic, i, u, f, v, agg};
+use {Intrinsic, i, i_, u, u_, f, v, agg, p};
 use IntrinsicDef::Named;
 use rustc::middle::ty;

--- a/src/librustc_platform_intrinsics/lib.rs
+++ b/src/librustc_platform_intrinsics/lib.rs
@ -30,10 +30,11 @@ pub struct Intrinsic {

+/// Describes one argument or return type of a platform intrinsic, both as
+/// the type checker should see it and as it should be lowered to LLVM.
 #[derive(Clone, Hash, Eq, PartialEq)]
 pub enum Type {
+    // No value; the type checker matches this against the empty tuple `()`.
+    Void,
    Integer(/* signed */ bool, u8, /* llvm width */ u8),
    Float(u8),
-    Pointer(Box<Type>),
-    Vector(Box<Type>, u8),
+    // Fields: pointee type seen by the type checker, optional pointee type
+    // to use on the LLVM side instead (the argument is pointer-cast when
+    // present), and whether the pointer must be `*const` rather than `*mut`.
+    Pointer(Box<Type>, Option<Box<Type>>, /* const */ bool),
+    // Fields: element type seen by the type checker, optional element type
+    // to use on the LLVM side instead (the argument is bitcast when
+    // present), and the number of lanes.
+    Vector(Box<Type>, Option<Box<Type>>, u8),
    Aggregate(bool, Vec<Type>),
 }

@ -47,10 +48,19 @@ fn u(width: u8) -> Type { Type::Integer(false, width, width) }
 #[allow(dead_code)]
 fn u_(width: u8, llvm_width: u8) -> Type { Type::Integer(false, width, llvm_width) }
 fn f(width: u8) -> Type { Type::Float(width) }
-fn v(x: Type, length: u8) -> Type { Type::Vector(Box::new(x), length) }
+fn v(x: Type, length: u8) -> Type { Type::Vector(Box::new(x), None, length) }
+/// Builds a `length`-lane vector type whose elements are `x` for type
+/// checking but `bitcast` on the LLVM side; the argument is bitcast to a
+/// vector of `bitcast` elements before the intrinsic is called.
+fn v_(x: Type, bitcast: Type, length: u8) -> Type {
+    Type::Vector(Box::new(x), Some(Box::new(bitcast)), length)
+}
 fn agg(flatten: bool, types: Vec<Type>) -> Type {
    Type::Aggregate(flatten, types)
 }
+/// Builds a raw-pointer type to `elem`.
+///
+/// `const_` selects `*const` (true) or `*mut` (false) for type checking.
+/// When `llvm_elem` is `Some`, the LLVM-level pointee is that type instead
+/// of `elem`, and the argument is pointer-cast accordingly at the call.
+fn p(const_: bool, elem: Type, llvm_elem: Option<Type>) -> Type {
+    Type::Pointer(Box::new(elem), llvm_elem.map(Box::new), const_)
+}
+/// Shorthand for `Type::Void`, used as the output of intrinsics that
+/// return nothing.
+fn void() -> Type {
+    Type::Void
+}

 mod x86;
 mod arm;
--- a/src/librustc_platform_intrinsics/x86.rs
+++ b/src/librustc_platform_intrinsics/x86.rs
@ -13,7 +13,7 @@

 #![allow(unused_imports)]

-use {Intrinsic, i, i_, u, u_, f, v, agg};
+use {Intrinsic, i, i_, u, u_, f, v, v_, agg, p, void};
 use IntrinsicDef::Named;
 use rustc::middle::ty;

@ -50,6 +50,11 @@ pub fn find<'tcx>(_tcx: &ty::ctxt<'tcx>, name: &str) -> Option<Intrinsic> {
            output: v(f(32), 4),
            definition: Named("llvm.sqrt.v4f32")
        },
+        "_storeu_ps" => Intrinsic {
+            inputs: vec![p(false, f(32), Some(i(8))), v(f(32), 4)],
+            output: void(),
+            definition: Named("llvm.x86.sse.storeu.ps")
+        },
        "_adds_epi8" => Intrinsic {
            inputs: vec![v(i(8), 16), v(i(8), 16)],
            output: v(i(8), 16),
@ -80,11 +85,21 @@ pub fn find<'tcx>(_tcx: &ty::ctxt<'tcx>, name: &str) -> Option<Intrinsic> {
            output: v(u(16), 8),
            definition: Named("llvm.x86.sse2.pavg.w")
        },
+        "_lfence" => Intrinsic {
+            inputs: vec![],
+            output: void(),
+            definition: Named("llvm.x86.sse2.lfence")
+        },
        "_madd_epi16" => Intrinsic {
            inputs: vec![v(i(16), 8), v(i(16), 8)],
            output: v(i(32), 4),
            definition: Named("llvm.x86.sse2.pmadd.wd")
        },
+        "_maskmoveu_si128" => Intrinsic {
+            inputs: vec![v(u(8), 16), v(u(8), 16), p(false, u(8), None)],
+            output: void(),
+            definition: Named("llvm.x86.sse2.maskmov.dqu")
+        },
        "_max_epi16" => Intrinsic {
            inputs: vec![v(i(16), 8), v(i(16), 8)],
            output: v(i(16), 8),
@ -100,6 +115,11 @@ pub fn find<'tcx>(_tcx: &ty::ctxt<'tcx>, name: &str) -> Option<Intrinsic> {
            output: v(f(64), 2),
            definition: Named("llvm.x86.sse2.max.pd")
        },
+        // Full memory fence. The LLVM intrinsic is spelled `mfence`; there
+        // is no `llvm.x86.sse2.fence`.
+        "_mfence" => Intrinsic {
+            inputs: vec![],
+            output: void(),
+            definition: Named("llvm.x86.sse2.mfence")
+        },
        "_min_epi16" => Intrinsic {
            inputs: vec![v(i(16), 8), v(i(16), 8)],
            output: v(i(16), 8),
@ -160,11 +180,26 @@ pub fn find<'tcx>(_tcx: &ty::ctxt<'tcx>, name: &str) -> Option<Intrinsic> {
            output: v(u(64), 2),
            definition: Named("llvm.x86.sse2.psad.bw")
        },
+        // Store fence. Unlike `lfence`/`mfence`, `sfence` was introduced
+        // with SSE (not SSE2), so LLVM names it under the `sse` prefix.
+        "_sfence" => Intrinsic {
+            inputs: vec![],
+            output: void(),
+            definition: Named("llvm.x86.sse.sfence")
+        },
        "_sqrt_pd" => Intrinsic {
            inputs: vec![v(f(64), 2)],
            output: v(f(64), 2),
            definition: Named("llvm.sqrt.v2f64")
        },
+        "_storeu_pd" => Intrinsic {
+            inputs: vec![p(false, f(64), Some(u(8))), v(f(64), 2)],
+            output: void(),
+            definition: Named("llvm.x86.sse2.storeu.pd")
+        },
+        "_storeu_si128" => Intrinsic {
+            inputs: vec![p(false, v(u(8), 16), Some(u(8))), v(u(8), 16)],
+            output: void(),
+            definition: Named("llvm.x86.sse2.storeu.dq")
+        },
        "_subs_epi8" => Intrinsic {
            inputs: vec![v(i(8), 16), v(i(8), 16)],
            output: v(i(8), 16),
@ -215,6 +250,11 @@ pub fn find<'tcx>(_tcx: &ty::ctxt<'tcx>, name: &str) -> Option<Intrinsic> {
            output: v(f(64), 2),
            definition: Named("llvm.x86.sse3.hsub.pd")
        },
+        "_lddqu_si128" => Intrinsic {
+            inputs: vec![p(true, v(u(8), 16), Some(i(8)))],
+            output: v(u(8), 16),
+            definition: Named("llvm.x86.sse3.ldu.dq")
+        },
        "_abs_epi8" => Intrinsic {
            inputs: vec![v(i(8), 16)],
            output: v(i(8), 16),
@ -490,6 +530,46 @@ pub fn find<'tcx>(_tcx: &ty::ctxt<'tcx>, name: &str) -> Option<Intrinsic> {
            output: v(f(64), 4),
            definition: Named("llvm.x86.avx.max.pd.256")
        },
+        "_maskload_ps" => Intrinsic {
+            inputs: vec![p(true, f(32), Some(i(8))), v_(i(32), f(32), 4)],
+            output: v(f(32), 4),
+            definition: Named("llvm.x86.avx.maskload.ps")
+        },
+        "_maskload_pd" => Intrinsic {
+            inputs: vec![p(true, f(64), Some(i(8))), v_(i(64), f(64), 2)],
+            output: v(f(64), 2),
+            definition: Named("llvm.x86.avx.maskload.pd")
+        },
+        "256_maskload_ps" => Intrinsic {
+            inputs: vec![p(true, f(32), Some(i(8))), v_(i(32), f(32), 8)],
+            output: v(f(32), 8),
+            definition: Named("llvm.x86.avx.maskload.ps.256")
+        },
+        "256_maskload_pd" => Intrinsic {
+            inputs: vec![p(true, f(64), Some(i(8))), v_(i(64), f(64), 4)],
+            output: v(f(64), 4),
+            definition: Named("llvm.x86.avx.maskload.pd.256")
+        },
+        "_maskstore_ps" => Intrinsic {
+            inputs: vec![p(false, f(32), Some(i(8))), v_(i(32), f(32), 4), v(f(32), 4)],
+            output: void(),
+            definition: Named("llvm.x86.avx.maskstore.ps")
+        },
+        "_maskstore_pd" => Intrinsic {
+            inputs: vec![p(false, f(64), Some(i(8))), v_(i(64), f(64), 2), v(f(64), 2)],
+            output: void(),
+            definition: Named("llvm.x86.avx.maskstore.pd")
+        },
+        "256_maskstore_ps" => Intrinsic {
+            inputs: vec![p(false, f(32), Some(i(8))), v_(i(32), f(32), 8), v(f(32), 8)],
+            output: void(),
+            definition: Named("llvm.x86.avx.maskstore.ps.256")
+        },
+        "256_maskstore_pd" => Intrinsic {
+            inputs: vec![p(false, f(64), Some(i(8))), v_(i(64), f(64), 4), v(f(64), 4)],
+            output: void(),
+            definition: Named("llvm.x86.avx.maskstore.pd.256")
+        },
        "256_min_ps" => Intrinsic {
            inputs: vec![v(f(32), 8), v(f(32), 8)],
            output: v(f(32), 8),
@ -540,6 +620,21 @@ pub fn find<'tcx>(_tcx: &ty::ctxt<'tcx>, name: &str) -> Option<Intrinsic> {
            output: v(f(32), 8),
            definition: Named("llvm.x86.avx.rsqrt.ps.256")
        },
+        "256_storeu_ps" => Intrinsic {
+            inputs: vec![p(false, v(f(32), 8), Some(u(8))), v(f(32), 8)],
+            output: void(),
+            definition: Named("llvm.x86.avx.storeu.ps.256")
+        },
+        // Unaligned store of four f64 lanes: this must bind to the `pd`
+        // intrinsic, not the `ps` one used by `256_storeu_ps`.
+        "256_storeu_pd" => Intrinsic {
+            inputs: vec![p(false, v(f(64), 4), Some(u(8))), v(f(64), 4)],
+            output: void(),
+            definition: Named("llvm.x86.avx.storeu.pd.256")
+        },
+        "256_storeu_si256" => Intrinsic {
+            inputs: vec![p(false, v(u(8), 32), Some(u(8))), v(u(8), 32)],
+            output: void(),
+            definition: Named("llvm.x86.avx.storeu.dq.256")
+        },
        "256_sqrt_ps" => Intrinsic {
            inputs: vec![v(f(32), 8)],
            output: v(f(32), 8),
@ -625,50 +720,60 @@ pub fn find<'tcx>(_tcx: &ty::ctxt<'tcx>, name: &str) -> Option<Intrinsic> {
            output: i(32),
            definition: Named("llvm.x86.avx.ptestz.256")
        },
+        "256_zeroall" => Intrinsic {
+            inputs: vec![],
+            output: void(),
+            definition: Named("llvm.x86.avx.vzeroall")
+        },
+        "256_zeroupper" => Intrinsic {
+            inputs: vec![],
+            output: void(),
+            definition: Named("llvm.x86.avx.vzeroupper")
+        },
        "256_abs_epi8" => Intrinsic {
            inputs: vec![v(i(8), 32)],
            output: v(i(8), 32),
-            definition: Named("llvm.x86.avx2.avx2.pabs.b")
+            definition: Named("llvm.x86.avx2.pabs.b")
        },
        "256_abs_epi16" => Intrinsic {
            inputs: vec![v(i(16), 16)],
            output: v(i(16), 16),
-            definition: Named("llvm.x86.avx2.avx2.pabs.w")
+            definition: Named("llvm.x86.avx2.pabs.w")
        },
        "256_abs_epi32" => Intrinsic {
            inputs: vec![v(i(32), 8)],
            output: v(i(32), 8),
-            definition: Named("llvm.x86.avx2.avx2.pabs.d")
+            definition: Named("llvm.x86.avx2.pabs.d")
        },
        "256_adds_epi8" => Intrinsic {
            inputs: vec![v(i(8), 32), v(i(8), 32)],
            output: v(i(8), 32),
-            definition: Named("llvm.x86.avx2.avx2.padds.b")
+            definition: Named("llvm.x86.avx2.padds.b")
        },
        "256_adds_epu8" => Intrinsic {
            inputs: vec![v(u(8), 32), v(u(8), 32)],
            output: v(u(8), 32),
-            definition: Named("llvm.x86.avx2.avx2.paddus.b")
+            definition: Named("llvm.x86.avx2.paddus.b")
        },
        "256_adds_epi16" => Intrinsic {
            inputs: vec![v(i(16), 16), v(i(16), 16)],
            output: v(i(16), 16),
-            definition: Named("llvm.x86.avx2.avx2.padds.w")
+            definition: Named("llvm.x86.avx2.padds.w")
        },
        "256_adds_epu16" => Intrinsic {
            inputs: vec![v(u(16), 16), v(u(16), 16)],
            output: v(u(16), 16),
-            definition: Named("llvm.x86.avx2.avx2.paddus.w")
+            definition: Named("llvm.x86.avx2.paddus.w")
        },
        "256_avg_epu8" => Intrinsic {
            inputs: vec![v(u(8), 32), v(u(8), 32)],
            output: v(u(8), 32),
-            definition: Named("llvm.x86.avx2.avx2.pavg.b")
+            definition: Named("llvm.x86.avx2.pavg.b")
        },
        "256_avg_epu16" => Intrinsic {
            inputs: vec![v(u(16), 16), v(u(16), 16)],
            output: v(u(16), 16),
-            definition: Named("llvm.x86.avx2.avx2.pavg.w")
+            definition: Named("llvm.x86.avx2.pavg.w")
        },
        "256_hadd_epi16" => Intrinsic {
            inputs: vec![v(i(16), 16), v(i(16), 16)],
@ -710,6 +815,126 @@ pub fn find<'tcx>(_tcx: &ty::ctxt<'tcx>, name: &str) -> Option<Intrinsic> {
            output: v(i(16), 16),
            definition: Named("llvm.x86.avx2.pmadd.ub.sw")
        },
+        "_mask_i32gather_epi32" => Intrinsic {
+            inputs: vec![v(i(32), 4), p(true, i(32), Some(i(8))), v(i(32), 4), v(i(32), 4), i_(32, 8)],
+            output: v(i(32), 4),
+            definition: Named("llvm.x86.avx2.gather.d.d")
+        },
+        "_mask_i32gather_ps" => Intrinsic {
+            inputs: vec![v(f(32), 4), p(true, f(32), Some(i(8))), v(i(32), 4), v_(i(32), f(32), 4), i_(32, 8)],
+            output: v(f(32), 4),
+            definition: Named("llvm.x86.avx2.gather.d.ps")
+        },
+        "256_mask_i32gather_epi32" => Intrinsic {
+            inputs: vec![v(i(32), 8), p(true, i(32), Some(i(8))), v(i(32), 8), v(i(32), 8), i_(32, 8)],
+            output: v(i(32), 8),
+            definition: Named("llvm.x86.avx2.gather.d.d.256")
+        },
+        "256_mask_i32gather_ps" => Intrinsic {
+            inputs: vec![v(f(32), 8), p(true, f(32), Some(i(8))), v(i(32), 8), v_(i(32), f(32), 8), i_(32, 8)],
+            output: v(f(32), 8),
+            definition: Named("llvm.x86.avx2.gather.d.ps.256")
+        },
+        "_mask_i32gather_epi64" => Intrinsic {
+            inputs: vec![v(i(64), 2), p(true, i(64), Some(i(8))), v(i(32), 4), v(i(64), 2), i_(32, 8)],
+            output: v(i(64), 2),
+            definition: Named("llvm.x86.avx2.gather.d.q")
+        },
+        "_mask_i32gather_pd" => Intrinsic {
+            inputs: vec![v(f(64), 2), p(true, f(64), Some(i(8))), v(i(32), 4), v_(i(64), f(64), 2), i_(32, 8)],
+            output: v(f(64), 2),
+            definition: Named("llvm.x86.avx2.gather.d.pd")
+        },
+        "256_mask_i32gather_epi64" => Intrinsic {
+            inputs: vec![v(i(64), 4), p(true, i(64), Some(i(8))), v(i(32), 4), v(i(64), 4), i_(32, 8)],
+            output: v(i(64), 4),
+            definition: Named("llvm.x86.avx2.gather.d.q.256")
+        },
+        "256_mask_i32gather_pd" => Intrinsic {
+            inputs: vec![v(f(64), 4), p(true, f(64), Some(i(8))), v(i(32), 4), v_(i(64), f(64), 4), i_(32, 8)],
+            output: v(f(64), 4),
+            definition: Named("llvm.x86.avx2.gather.d.pd.256")
+        },
+        "_mask_i64gather_epi32" => Intrinsic {
+            inputs: vec![v(i(32), 4), p(true, i(32), Some(i(8))), v(i(64), 2), v(i(32), 4), i_(32, 8)],
+            output: v(i(32), 4),
+            definition: Named("llvm.x86.avx2.gather.q.d")
+        },
+        "_mask_i64gather_ps" => Intrinsic {
+            inputs: vec![v(f(32), 4), p(true, f(32), Some(i(8))), v(i(64), 2), v_(i(32), f(32), 4), i_(32, 8)],
+            output: v(f(32), 4),
+            definition: Named("llvm.x86.avx2.gather.q.ps")
+        },
+        // Four 64-bit indices gather four 32-bit values, so the data and
+        // mask vectors stay 128 bits wide while the index vector is 256
+        // bits. That index width selects the `.256` LLVM intrinsic; the
+        // unsuffixed name is the two-index form used by
+        // `_mask_i64gather_epi32`.
+        "256_mask_i64gather_epi32" => Intrinsic {
+            inputs: vec![v(i(32), 4), p(true, i(32), Some(i(8))), v(i(64), 4), v(i(32), 4), i_(32, 8)],
+            output: v(i(32), 4),
+            definition: Named("llvm.x86.avx2.gather.q.d.256")
+        },
+        // Four 64-bit indices gather four f32 values, so the data and mask
+        // vectors stay 128 bits wide while the index vector is 256 bits.
+        // That index width selects the `.256` LLVM intrinsic; the
+        // unsuffixed name is the two-index form used by
+        // `_mask_i64gather_ps`.
+        "256_mask_i64gather_ps" => Intrinsic {
+            inputs: vec![v(f(32), 4), p(true, f(32), Some(i(8))), v(i(64), 4), v_(i(32), f(32), 4), i_(32, 8)],
+            output: v(f(32), 4),
+            definition: Named("llvm.x86.avx2.gather.q.ps.256")
+        },
+        "_mask_i64gather_epi64" => Intrinsic {
+            inputs: vec![v(i(64), 2), p(true, i(64), Some(i(8))), v(i(64), 2), v(i(64), 2), i_(32, 8)],
+            output: v(i(64), 2),
+            definition: Named("llvm.x86.avx2.gather.q.q")
+        },
+        "_mask_i64gather_pd" => Intrinsic {
+            inputs: vec![v(f(64), 2), p(true, f(64), Some(i(8))), v(i(64), 2), v_(i(64), f(64), 2), i_(32, 8)],
+            output: v(f(64), 2),
+            definition: Named("llvm.x86.avx2.gather.q.pd")
+        },
+        "256_mask_i64gather_epi64" => Intrinsic {
+            inputs: vec![v(i(64), 4), p(true, i(64), Some(i(8))), v(i(64), 4), v(i(64), 4), i_(32, 8)],
+            output: v(i(64), 4),
+            definition: Named("llvm.x86.avx2.gather.q.q.256")
+        },
+        "256_mask_i64gather_pd" => Intrinsic {
+            inputs: vec![v(f(64), 4), p(true, f(64), Some(i(8))), v(i(64), 4), v_(i(64), f(64), 4), i_(32, 8)],
+            output: v(f(64), 4),
+            definition: Named("llvm.x86.avx2.gather.q.pd.256")
+        },
+        "_maskload_epi32" => Intrinsic {
+            inputs: vec![p(true, v(i(32), 4), Some(i(8))), v(i(32), 4)],
+            output: v(i(32), 4),
+            definition: Named("llvm.x86.avx2.maskload.d")
+        },
+        "_maskload_epi64" => Intrinsic {
+            inputs: vec![p(true, v(i(64), 2), Some(i(8))), v(i(64), 2)],
+            output: v(i(64), 2),
+            definition: Named("llvm.x86.avx2.maskload.q")
+        },
+        "256_maskload_epi32" => Intrinsic {
+            inputs: vec![p(true, v(i(32), 8), Some(i(8))), v(i(32), 8)],
+            output: v(i(32), 8),
+            definition: Named("llvm.x86.avx2.maskload.d.256")
+        },
+        "256_maskload_epi64" => Intrinsic {
+            inputs: vec![p(true, v(i(64), 4), Some(i(8))), v(i(64), 4)],
+            output: v(i(64), 4),
+            definition: Named("llvm.x86.avx2.maskload.q.256")
+        },
+        "_maskstore_epi32" => Intrinsic {
+            inputs: vec![p(false, i(32), Some(i(8))), v(i(32), 4), v(i(32), 4)],
+            output: void(),
+            definition: Named("llvm.x86.avx2.maskstore.d")
+        },
+        "_maskstore_epi64" => Intrinsic {
+            inputs: vec![p(false, i(64), Some(i(8))), v(i(64), 2), v(i(64), 2)],
+            output: void(),
+            definition: Named("llvm.x86.avx2.maskstore.q")
+        },
+        "256_maskstore_epi32" => Intrinsic {
+            inputs: vec![p(false, i(32), Some(i(8))), v(i(32), 8), v(i(32), 8)],
+            output: void(),
+            definition: Named("llvm.x86.avx2.maskstore.d.256")
+        },
+        "256_maskstore_epi64" => Intrinsic {
+            inputs: vec![p(false, i(64), Some(i(8))), v(i(64), 4), v(i(64), 4)],
+            output: void(),
+            definition: Named("llvm.x86.avx2.maskstore.q.256")
+        },
        "256_max_epi8" => Intrinsic {
            inputs: vec![v(i(8), 32), v(i(8), 32)],
            output: v(i(8), 32),
--- a/src/librustc_trans/trans/intrinsic.rs
+++ b/src/librustc_trans/trans/intrinsic.rs
@ -936,6 +936,7 @@ pub fn trans_intrinsic_call<'a, 'blk, 'tcx>(mut bcx: Block<'blk, 'tcx>,
                          any_changes_needed: &mut bool) -> Vec<Type> {
                use intrinsics::Type::*;
                match *t {
+                    Void => vec![Type::void(ccx)],
                    Integer(_signed, width, llvm_width) => {
                        *any_changes_needed |= width != llvm_width;
                        vec![Type::ix(ccx, llvm_width as u64)]
@ -947,14 +948,29 @@ pub fn trans_intrinsic_call<'a, 'blk, 'tcx>(mut bcx: Block<'blk, 'tcx>,
                            _ => unreachable!()
                        }
                    }
-                    Pointer(_) => unimplemented!(),
-                    Vector(ref t, length) => {
+                    Pointer(ref t, ref llvm_elem, _const) => {
+                        *any_changes_needed |= llvm_elem.is_some();
+
+                        let t = llvm_elem.as_ref().unwrap_or(t);
+                        let elem = one(ty_to_type(ccx, t,
+                                                  any_changes_needed));
+                        vec![elem.ptr_to()]
+                    }
+                    Vector(ref t, ref llvm_elem, length) => {
+                        *any_changes_needed |= llvm_elem.is_some();
+
+                        let t = llvm_elem.as_ref().unwrap_or(t);
                        let elem = one(ty_to_type(ccx, t,
                                                  any_changes_needed));
                        vec![Type::vector(&elem,
                                          length as u64)]
                    }
-                    Aggregate(false, _) => unimplemented!(),
+                    Aggregate(false, ref contents) => {
+                        let elems = contents.iter()
+                                            .map(|t| one(ty_to_type(ccx, t, any_changes_needed)))
+                                            .collect::<Vec<_>>();
+                        vec![Type::struct_(ccx, &elems, false)]
+                    }
                    Aggregate(true, ref contents) => {
                        *any_changes_needed = true;
                        contents.iter()
@ -965,8 +981,9 @@ pub fn trans_intrinsic_call<'a, 'blk, 'tcx>(mut bcx: Block<'blk, 'tcx>,
            }

            // This allows an argument list like `foo, (bar, baz),
-            // qux` to be converted into `foo, bar, baz, qux`, and
-            // integer arguments to be truncated as needed.
+            // qux` to be converted into `foo, bar, baz, qux`, integer
+            // arguments to be truncated as needed and pointers to be
+            // cast.
            fn modify_as_needed<'blk, 'tcx>(bcx: Block<'blk, 'tcx>,
                                            t: &intrinsics::Type,
                                            arg_type: Ty<'tcx>,
@ -991,6 +1008,16 @@ pub fn trans_intrinsic_call<'a, 'blk, 'tcx>(mut bcx: Block<'blk, 'tcx>,
                            })
                            .collect()
                    }
+                    intrinsics::Type::Pointer(_, Some(ref llvm_elem), _) => {
+                        let llvm_elem = one(ty_to_type(bcx.ccx(), llvm_elem, &mut false));
+                        vec![PointerCast(bcx, llarg,
+                                         llvm_elem.ptr_to())]
+                    }
+                    intrinsics::Type::Vector(_, Some(ref llvm_elem), length) => {
+                        let llvm_elem = one(ty_to_type(bcx.ccx(), llvm_elem, &mut false));
+                        vec![BitCast(bcx, llarg,
+                                     Type::vector(&llvm_elem, length as u64))]
+                    }
                    intrinsics::Type::Integer(_, width, llvm_width) if width != llvm_width => {
                        // the LLVM intrinsic uses a smaller integer
                        // size than the C intrinsic's signature, so
@ -1027,7 +1054,7 @@ pub fn trans_intrinsic_call<'a, 'blk, 'tcx>(mut bcx: Block<'blk, 'tcx>,
            };
            assert_eq!(inputs.len(), llargs.len());

-            match intr.definition {
+            let val = match intr.definition {
                intrinsics::IntrinsicDef::Named(name) => {
                    let f = declare::declare_cfn(ccx,
                                                 name,
@ -1035,6 +1062,20 @@ pub fn trans_intrinsic_call<'a, 'blk, 'tcx>(mut bcx: Block<'blk, 'tcx>,
                                                 tcx.mk_nil());
                    Call(bcx, f, &llargs, None, call_debug_location)
                }
+            };
+
+            match intr.output {
+                intrinsics::Type::Aggregate(flatten, ref elems) => {
+                    // the output is a tuple so we need to munge it properly
+                    assert!(!flatten);
+
+                    for i in 0..elems.len() {
+                        let val = ExtractValue(bcx, val, i);
+                        Store(bcx, val, StructGEP(bcx, llresult, i));
+                    }
+                    C_nil(ccx)
+                }
+                _ => val,
            }
        }
    };
--- a/src/librustc_typeck/check/intrinsic.rs
+++ b/src/librustc_typeck/check/intrinsic.rs
@ -464,6 +464,10 @@ fn match_intrinsic_type_to_type<'tcx, 'a>(
    };

    match *expected {
+        Void => match t.sty {
+            ty::TyTuple(ref v) if v.is_empty() => {},
+            _ => simple_error(&format!("`{}`", t), "()"),
+        },
        // (The width we pass to LLVM doesn't concern the type checker.)
        Integer(signed, bits, _llvm_width) => match (signed, bits, &t.sty) {
            (true,  8,  &ty::TyInt(hir::IntTy::TyI8)) |
@ -485,8 +489,21 @@ fn match_intrinsic_type_to_type<'tcx, 'a>(
            _ => simple_error(&format!("`{}`", t),
                              &format!("`f{n}`", n = bits)),
        },
-        Pointer(_) => unimplemented!(),
-        Vector(ref inner_expected, len) => {
+        Pointer(ref inner_expected, ref _llvm_type, const_) => {
+            match t.sty {
+                ty::TyRawPtr(ty::TypeAndMut { ty, mutbl }) => {
+                    if (mutbl == hir::MutImmutable) != const_ {
+                        simple_error(&format!("`{}`", t),
+                                     if const_ {"const pointer"} else {"mut pointer"})
+                    }
+                    match_intrinsic_type_to_type(tcx, position, span, structural_to_nominal,
+                                                 inner_expected, ty)
+                }
+                _ => simple_error(&format!("`{}`", t),
+                                  &format!("raw pointer")),
+            }
+        }
+        Vector(ref inner_expected, ref _llvm_type, len) => {
            if !t.is_simd() {
                simple_error(&format!("non-simd type `{}`", t),
                             "simd type");