ppv_lite86/x86_64/
mod.rs

1// crate minimums: sse2, x86_64
2
3use crate::types::*;
4use core::arch::x86_64::{__m128i, __m256i};
5
6mod sse2;
7
/// Type-level flag for the `S3` parameter of `SseMachine`: set.
///
/// The `SSSE3` and `SSE41` aliases in this module select this marker.
#[derive(Clone, Copy)]
pub struct YesS3;
/// Type-level flag for the `S3` parameter of `SseMachine`: not set.
///
/// The baseline `SSE2` alias selects this marker.
#[derive(Clone, Copy)]
pub struct NoS3;
12
/// Type-level flag for the `S4` parameter of `SseMachine`: set.
///
/// The `SSE41` alias in this module selects this marker.
#[derive(Clone, Copy)]
pub struct YesS4;
/// Type-level flag for the `S4` parameter of `SseMachine`: not set.
///
/// The `SSE2` and `SSSE3` aliases select this marker.
#[derive(Clone, Copy)]
pub struct NoS4;
17
/// Type-level "yes" marker named `A1`.
///
/// NOTE(review): not referenced anywhere else in this file; presumably an AVX capability flag —
/// confirm against the `sse2` submodule.
#[derive(Clone, Copy)]
pub struct YesA1;
/// Type-level "no" marker paired with `YesA1`.
#[derive(Clone, Copy)]
pub struct NoA1;
22
/// Type-level "yes" marker named `A2`.
///
/// NOTE(review): not referenced anywhere else in this file; presumably an AVX2 capability flag —
/// confirm against the `sse2` submodule.
#[derive(Clone, Copy)]
pub struct YesA2;
/// Type-level "no" marker paired with `YesA2`.
#[derive(Clone, Copy)]
pub struct NoA2;
27
/// Type-level flag for the `NI` parameter of the machine types: set.
///
/// NOTE(review): no alias in this file selects it; presumably it stands for AES-NI — confirm.
#[derive(Clone, Copy)]
pub struct YesNI;
/// Type-level flag for the `NI` parameter of the machine types: not set.
///
/// Every machine alias defined in this module uses this marker.
#[derive(Clone, Copy)]
pub struct NoNI;
32
33use core::marker::PhantomData;
34
35#[derive(Copy, Clone)]
36pub struct SseMachine<S3, S4, NI>(PhantomData<(S3, S4, NI)>);
37impl<S3: Copy, S4: Copy, NI: Copy> Machine for SseMachine<S3, S4, NI>
38where
39    sse2::u128x1_sse2<S3, S4, NI>: Swap64,
40    sse2::u64x2_sse2<S3, S4, NI>: BSwap + RotateEachWord32 + MultiLane<[u64; 2]> + Vec2<u64>,
41    sse2::u32x4_sse2<S3, S4, NI>: BSwap + RotateEachWord32 + MultiLane<[u32; 4]> + Vec4<u32>,
42    sse2::u64x4_sse2<S3, S4, NI>: BSwap + Words4,
43    sse2::u128x1_sse2<S3, S4, NI>: BSwap,
44    sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u64x2x2_sse2<S3, S4, NI>>,
45    sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u64x4_sse2<S3, S4, NI>>,
46    sse2::u128x2_sse2<S3, S4, NI>: Into<sse2::u32x4x2_sse2<S3, S4, NI>>,
47    sse2::u128x4_sse2<S3, S4, NI>: Into<sse2::u64x2x4_sse2<S3, S4, NI>>,
48    sse2::u128x4_sse2<S3, S4, NI>: Into<sse2::u32x4x4_sse2<S3, S4, NI>>,
49{
50    type u32x4 = sse2::u32x4_sse2<S3, S4, NI>;
51    type u64x2 = sse2::u64x2_sse2<S3, S4, NI>;
52    type u128x1 = sse2::u128x1_sse2<S3, S4, NI>;
53
54    type u32x4x2 = sse2::u32x4x2_sse2<S3, S4, NI>;
55    type u64x2x2 = sse2::u64x2x2_sse2<S3, S4, NI>;
56    type u64x4 = sse2::u64x4_sse2<S3, S4, NI>;
57    type u128x2 = sse2::u128x2_sse2<S3, S4, NI>;
58
59    type u32x4x4 = sse2::u32x4x4_sse2<S3, S4, NI>;
60    type u64x2x4 = sse2::u64x2x4_sse2<S3, S4, NI>;
61    type u128x4 = sse2::u128x4_sse2<S3, S4, NI>;
62
63    #[inline(always)]
64    unsafe fn instance() -> Self {
65        SseMachine(PhantomData)
66    }
67}
68
69#[derive(Copy, Clone)]
70pub struct Avx2Machine<NI>(PhantomData<NI>);
71impl<NI: Copy> Machine for Avx2Machine<NI>
72where
73    sse2::u128x1_sse2<YesS3, YesS4, NI>: BSwap + Swap64,
74    sse2::u64x2_sse2<YesS3, YesS4, NI>: BSwap + RotateEachWord32 + MultiLane<[u64; 2]> + Vec2<u64>,
75    sse2::u32x4_sse2<YesS3, YesS4, NI>: BSwap + RotateEachWord32 + MultiLane<[u32; 4]> + Vec4<u32>,
76    sse2::u64x4_sse2<YesS3, YesS4, NI>: BSwap + Words4,
77{
78    type u32x4 = sse2::u32x4_sse2<YesS3, YesS4, NI>;
79    type u64x2 = sse2::u64x2_sse2<YesS3, YesS4, NI>;
80    type u128x1 = sse2::u128x1_sse2<YesS3, YesS4, NI>;
81
82    type u32x4x2 = sse2::u32x4x2_sse2<YesS3, YesS4, NI>;
83    type u64x2x2 = sse2::u64x2x2_sse2<YesS3, YesS4, NI>;
84    type u64x4 = sse2::u64x4_sse2<YesS3, YesS4, NI>;
85    type u128x2 = sse2::u128x2_sse2<YesS3, YesS4, NI>;
86
87    type u32x4x4 = sse2::avx2::u32x4x4_avx2<NI>;
88    type u64x2x4 = sse2::u64x2x4_sse2<YesS3, YesS4, NI>;
89    type u128x4 = sse2::u128x4_sse2<YesS3, YesS4, NI>;
90
91    #[inline(always)]
92    unsafe fn instance() -> Self {
93        Avx2Machine(PhantomData)
94    }
95}
96
97pub type SSE2 = SseMachine<NoS3, NoS4, NoNI>;
98pub type SSSE3 = SseMachine<YesS3, NoS4, NoNI>;
99pub type SSE41 = SseMachine<YesS3, YesS4, NoNI>;
100/// AVX but not AVX2: only 128-bit integer operations, but use VEX versions of everything
101/// to avoid expensive SSE/VEX conflicts.
102pub type AVX = SseMachine<YesS3, YesS4, NoNI>;
103pub type AVX2 = Avx2Machine<NoNI>;
104
105/// Generic wrapper for unparameterized storage of any of the possible impls.
106/// Converting into and out of this type should be essentially free, although it may be more
107/// aligned than a particular impl requires.
108#[allow(non_camel_case_types)]
109#[derive(Copy, Clone)]
110pub union vec128_storage {
111    u32x4: [u32; 4],
112    u64x2: [u64; 2],
113    u128x1: [u128; 1],
114    sse2: __m128i,
115}
116impl Store<vec128_storage> for vec128_storage {
117    #[inline(always)]
118    unsafe fn unpack(p: vec128_storage) -> Self {
119        p
120    }
121}
122impl<'a> Into<&'a [u32; 4]> for &'a vec128_storage {
123    #[inline(always)]
124    fn into(self) -> &'a [u32; 4] {
125        unsafe { &self.u32x4 }
126    }
127}
128impl Into<vec128_storage> for [u32; 4] {
129    #[inline(always)]
130    fn into(self) -> vec128_storage {
131        vec128_storage { u32x4: self }
132    }
133}
134impl Default for vec128_storage {
135    #[inline(always)]
136    fn default() -> Self {
137        vec128_storage { u128x1: [0] }
138    }
139}
140impl Eq for vec128_storage {}
141impl PartialEq for vec128_storage {
142    #[inline(always)]
143    fn eq(&self, rhs: &Self) -> bool {
144        unsafe { self.u128x1 == rhs.u128x1 }
145    }
146}
147
148#[allow(non_camel_case_types)]
149#[derive(Copy, Clone)]
150pub union vec256_storage {
151    u32x8: [u32; 8],
152    u64x4: [u64; 4],
153    u128x2: [u128; 2],
154    sse2: [vec128_storage; 2],
155    avx: __m256i,
156}
157impl Into<vec256_storage> for [u64; 4] {
158    #[inline(always)]
159    fn into(self) -> vec256_storage {
160        vec256_storage { u64x4: self }
161    }
162}
163impl Default for vec256_storage {
164    #[inline(always)]
165    fn default() -> Self {
166        vec256_storage { u128x2: [0, 0] }
167    }
168}
169impl vec256_storage {
170    pub fn new128(xs: [vec128_storage; 2]) -> Self {
171        Self { sse2: xs }
172    }
173    pub fn split128(self) -> [vec128_storage; 2] {
174        unsafe { self.sse2 }
175    }
176}
177impl Eq for vec256_storage {}
178impl PartialEq for vec256_storage {
179    #[inline(always)]
180    fn eq(&self, rhs: &Self) -> bool {
181        unsafe { self.sse2 == rhs.sse2 }
182    }
183}
184
185#[allow(non_camel_case_types)]
186#[derive(Copy, Clone)]
187pub union vec512_storage {
188    u32x16: [u32; 16],
189    u64x8: [u64; 8],
190    u128x4: [u128; 4],
191    sse2: [vec128_storage; 4],
192    avx: [vec256_storage; 2],
193}
194impl Default for vec512_storage {
195    #[inline(always)]
196    fn default() -> Self {
197        vec512_storage {
198            u128x4: [0, 0, 0, 0],
199        }
200    }
201}
202impl vec512_storage {
203    pub fn new128(xs: [vec128_storage; 4]) -> Self {
204        Self { sse2: xs }
205    }
206    pub fn split128(self) -> [vec128_storage; 4] {
207        unsafe { self.sse2 }
208    }
209}
210impl Eq for vec512_storage {}
211impl PartialEq for vec512_storage {
212    #[inline(always)]
213    fn eq(&self, rhs: &Self) -> bool {
214        unsafe { self.avx == rhs.avx }
215    }
216}
217
218macro_rules! impl_into {
219    ($storage:ident, $array:ty, $name:ident) => {
220        impl Into<$array> for $storage {
221            #[inline(always)]
222            fn into(self) -> $array {
223                unsafe { self.$name }
224            }
225        }
226    };
227}
228impl_into!(vec128_storage, [u32; 4], u32x4);
229impl_into!(vec128_storage, [u64; 2], u64x2);
230impl_into!(vec128_storage, [u128; 1], u128x1);
231impl_into!(vec256_storage, [u32; 8], u32x8);
232impl_into!(vec256_storage, [u64; 4], u64x4);
233impl_into!(vec256_storage, [u128; 2], u128x2);
234impl_into!(vec512_storage, [u32; 16], u32x16);
235impl_into!(vec512_storage, [u64; 8], u64x8);
236impl_into!(vec512_storage, [u128; 4], u128x4);
237
/// Wrap a function so that it is compiled once per supported x86-64 feature level, with the best
/// available copy chosen for the caller.
///
/// * With the `std` feature, copies targeting AVX2, AVX, SSE4.1, SSSE3 and SSE2 are emitted and
///   one of them is picked at run time through `is_x86_feature_detected!`.
/// * Without it, a single machine is selected at compile time from the enabled
///   `target_feature`s.
///
/// Since every feature level gets its own copy of the body, this dispatcher is the right choice
/// when throughput matters more than code size.
#[macro_export]
macro_rules! dispatch {
    // Form with an explicit return type.
    ($mach:ident, $MTy:ident, { $([$qual:tt$(($within:tt))*])* fn $name:ident($($arg:ident: $aty:ty),* $(,)*) -> $ret:ty $body:block }) => {
        // Run-time selection.
        #[cfg(feature = "std")]
        $($qual$(($within))*)* fn $name($($arg: $aty),*) -> $ret {
            use std::arch::x86_64::*;
            // The generic body is force-inlined into each wrapper below.
            #[inline(always)]
            fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $aty),*) -> $ret $body
            #[target_feature(enable = "sse2")]
            unsafe fn impl_sse2($($arg: $aty),*) -> $ret {
                fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
            }
            #[target_feature(enable = "ssse3")]
            unsafe fn impl_ssse3($($arg: $aty),*) -> $ret {
                fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*)
            }
            #[target_feature(enable = "sse4.1,ssse3")]
            unsafe fn impl_sse41($($arg: $aty),*) -> $ret {
                fn_impl($crate::x86_64::SSE41::instance(), $($arg),*)
            }
            #[target_feature(enable = "avx,sse4.1,ssse3")]
            unsafe fn impl_avx($($arg: $aty),*) -> $ret {
                let out = fn_impl($crate::x86_64::AVX::instance(), $($arg),*);
                _mm256_zeroupper();
                out
            }
            #[target_feature(enable = "avx2")]
            unsafe fn impl_avx2($($arg: $aty),*) -> $ret {
                let out = fn_impl($crate::x86_64::AVX2::instance(), $($arg),*);
                _mm256_zeroupper();
                out
            }
            // Probe from the most to the least capable feature level.
            unsafe {
                if is_x86_feature_detected!("avx2") {
                    impl_avx2($($arg),*)
                } else if is_x86_feature_detected!("avx") {
                    impl_avx($($arg),*)
                } else if is_x86_feature_detected!("sse4.1") {
                    impl_sse41($($arg),*)
                } else if is_x86_feature_detected!("ssse3") {
                    impl_ssse3($($arg),*)
                } else if is_x86_feature_detected!("sse2") {
                    impl_sse2($($arg),*)
                } else {
                    unimplemented!()
                }
            }
        }
        // Compile-time selection.
        #[cfg(not(feature = "std"))]
        #[inline(always)]
        $($qual$(($within))*)* fn $name($($arg: $aty),*) -> $ret {
            unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $aty),*) -> $ret $body
            unsafe {
                if cfg!(target_feature = "avx2") {
                    fn_impl($crate::x86_64::AVX2::instance(), $($arg),*)
                } else if cfg!(target_feature = "avx") {
                    fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
                } else if cfg!(target_feature = "sse4.1") {
                    fn_impl($crate::x86_64::SSE41::instance(), $($arg),*)
                } else if cfg!(target_feature = "ssse3") {
                    fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*)
                } else {
                    fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
                }
            }
        }
    };
    // Form without a return type: forwarded to the arm above with `-> ()`.
    ($mach:ident, $MTy:ident, { $([$qual:tt $(($within:tt))*])* fn $name:ident($($arg:ident: $aty:ty),* $(,)*) $body:block }) => {
        dispatch!($mach, $MTy, {
            $([$qual $(($within))*])* fn $name($($arg: $aty),*) -> () $body
        });
    }
}
318
/// Wrap a function with only the implementations needed to operate efficiently on 128-bit
/// vectors on this platform. On x86-64 with the `std` feature that means one AVX copy and one
/// SSE2 copy, chosen at run time.
///
/// Without the `std` feature the machine is selected at compile time from the enabled
/// `target_feature`s, using the same chain as `dispatch!`.
///
/// Use this dispatcher for vector code that gains little from more advanced hardware features
/// (for example because it runs infrequently), where keeping code size down matters more.
#[macro_export]
macro_rules! dispatch_light128 {
    // Form with an explicit return type.
    ($mach:ident, $MTy:ident, { $([$qual:tt$(($within:tt))*])* fn $name:ident($($arg:ident: $aty:ty),* $(,)*) -> $ret:ty $body:block }) => {
        // Run-time selection between the two generated copies.
        #[cfg(feature = "std")]
        $($qual $(($within))*)* fn $name($($arg: $aty),*) -> $ret {
            use std::arch::x86_64::*;
            // The generic body is force-inlined into each wrapper below.
            #[inline(always)]
            fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $aty),*) -> $ret $body
            #[target_feature(enable = "sse2")]
            unsafe fn impl_sse2($($arg: $aty),*) -> $ret {
                fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
            }
            #[target_feature(enable = "avx")]
            unsafe fn impl_avx($($arg: $aty),*) -> $ret {
                fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
            }
            unsafe {
                if is_x86_feature_detected!("avx") {
                    impl_avx($($arg),*)
                } else if is_x86_feature_detected!("sse2") {
                    impl_sse2($($arg),*)
                } else {
                    unimplemented!()
                }
            }
        }
        // Compile-time selection.
        #[cfg(not(feature = "std"))]
        #[inline(always)]
        $($qual$(($within))*)* fn $name($($arg: $aty),*) -> $ret {
            unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $aty),*) -> $ret $body
            unsafe {
                if cfg!(target_feature = "avx2") {
                    fn_impl($crate::x86_64::AVX2::instance(), $($arg),*)
                } else if cfg!(target_feature = "avx") {
                    fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
                } else if cfg!(target_feature = "sse4.1") {
                    fn_impl($crate::x86_64::SSE41::instance(), $($arg),*)
                } else if cfg!(target_feature = "ssse3") {
                    fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*)
                } else {
                    fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
                }
            }
        }
    };
    // Form without a return type: forwarded to the arm above with `-> ()`.
    ($mach:ident, $MTy:ident, { $([$qual:tt$(($within:tt))*])* fn $name:ident($($arg:ident: $aty:ty),* $(,)*) $body:block }) => {
        dispatch_light128!($mach, $MTy, {
            $([$qual $(($within))*])* fn $name($($arg: $aty),*) -> () $body
        });
    }
}
376
/// Generate only the basic implementations necessary to be able to operate efficiently on 256-bit
/// vectors on this platform.
///
/// With the `std` feature, one AVX copy and one SSE2 copy of the function are emitted and the
/// choice is made at run time; no dedicated AVX2 copy is generated by this macro. Without the
/// `std` feature the machine is selected at compile time from the enabled `target_feature`s
/// (AVX2, AVX, SSE4.1, SSSE3, falling back to SSE2).
///
/// This dispatcher is suitable for vector operations that do not benefit from advanced hardware
/// features (e.g. because they are done infrequently), so minimizing their contribution to code
/// size is more important.
#[macro_export]
macro_rules! dispatch_light256 {
    // Form with an explicit return type.
    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
        // Run-time selection between the two generated copies.
        //
        // The visibility tokens must be emitted WITHOUT the surrounding brackets: the brackets
        // only exist in the macro's input syntax, and `[pub] fn ...` is not a valid item.
        #[cfg(feature = "std")]
        $($pub $(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
            // The generic body is force-inlined into each wrapper below.
            #[inline(always)]
            fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
            use std::arch::x86_64::*;
            #[target_feature(enable = "avx")]
            unsafe fn impl_avx($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
            }
            #[target_feature(enable = "sse2")]
            unsafe fn impl_sse2($($arg: $argty),*) -> $ret {
                fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
            }
            unsafe {
                if is_x86_feature_detected!("avx") {
                    impl_avx($($arg),*)
                } else if is_x86_feature_detected!("sse2") {
                    impl_sse2($($arg),*)
                } else {
                    unimplemented!()
                }
            }
        }
        // Compile-time selection.
        #[cfg(not(feature = "std"))]
        #[inline(always)]
        $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
            unsafe fn fn_impl<$MTy: $crate::Machine>($mach: $MTy, $($arg: $argty),*) -> $ret $body
            unsafe {
                if cfg!(target_feature = "avx2") {
                    fn_impl($crate::x86_64::AVX2::instance(), $($arg),*)
                } else if cfg!(target_feature = "avx") {
                    fn_impl($crate::x86_64::AVX::instance(), $($arg),*)
                } else if cfg!(target_feature = "sse4.1") {
                    fn_impl($crate::x86_64::SSE41::instance(), $($arg),*)
                } else if cfg!(target_feature = "ssse3") {
                    fn_impl($crate::x86_64::SSSE3::instance(), $($arg),*)
                } else {
                    fn_impl($crate::x86_64::SSE2::instance(), $($arg),*)
                }
            }
        }
    };
    // Form without a return type: forwarded to the arm above with `-> ()`. Here the brackets are
    // kept, because the tokens are fed back in as macro input.
    ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) $body:block }) => {
        dispatch_light256!($mach, $MTy, {
            $([$pub $(($krate))*])* fn $name($($arg: $argty),*) -> () $body
        });
    }
}