c++ - 如何在 SSE2 中为 8 位和 16 位整数实现 vector 右移和左移?

我在为下一个项目做调研时看到了这篇帖子。能够使用 SIMD 对 8 位和 16 位整数按元素(每个元素使用各自的移位量)进行移位,对我来说非常有用,我想这里的很多人也会用得上。

对我来说不幸的是,我的项目将在其上运行的平台最多具有 SSE2 功能。

仅仅把

 _mm256_*** 

替换为

 _mm_*** 

是行不通的,因为那段代码还用到了:

 _mm_shuffle_epi8() //Requires SSSE3 
 _mm_blendv_epi8()  //Requires SSE4.1
 _mm_blend_epi16()  //Requires SSE4.1
 _mm_sllv_epi32()   //Requires AVX2

所以你看到了我的困境。仅使用 SSE2 也许根本无法实现,但如果有人能证明我错了,我会非常高兴(坦率地说,也会很惊讶)。

提前致谢。

最佳答案

这不是最好的代码,我真的不能说它比将每个元素作为 uint16 处理更好还是更差。如果您确保位移量始终小于 16,您可以节省一些操作,但它仍然不是很好。

/*
 * Per-lane variable left shift of eight 16-bit integers, using SSE2 only.
 *
 * v  values to shift, one per 16-bit lane.
 * s  shift counts, one per 16-bit lane, interpreted as unsigned.
 *
 * Returns a vector in which each lane holds v << s for counts 0..15 and
 * zero for any count of 16 or more (including counts with the top bit set).
 */
__m128i sllv_epi16(__m128i v, __m128i s) {
    // one constant per bit of a 4-bit shift count
    const __m128i bit0 = _mm_set1_epi16(1);
    const __m128i bit1 = _mm_set1_epi16(2);
    const __m128i bit2 = _mm_set1_epi16(4);
    const __m128i bit3 = _mm_set1_epi16(8);

    // A count is in range exactly when its upper 12 bits are all zero.
    // A signed min/compare against 16 is not enough here: counts of 0x8000
    // and above look negative and would be shifted by their low four bits
    // instead of producing zero. _mm_min_epu16 would work but needs SSE4.1.
    const __m128i in_range =
        _mm_cmpeq_epi16(_mm_srli_epi16(s, 4), _mm_setzero_si128());

    // turn each count bit into an all-ones / all-zeros lane mask
    const __m128i take1 = _mm_cmpeq_epi16(_mm_and_si128(s, bit0), bit0);
    const __m128i take2 = _mm_cmpeq_epi16(_mm_and_si128(s, bit1), bit1);
    const __m128i take4 = _mm_cmpeq_epi16(_mm_and_si128(s, bit2), bit2);
    const __m128i take8 = _mm_cmpeq_epi16(_mm_and_si128(s, bit3), bit3);

    // Apply the shift in four stages (1, 2, 4, 8 bits); in each stage a lane
    // keeps the shifted value only if the matching bit of its count is set.
    __m128i shifted = _mm_slli_epi16(v, 1);
    v = _mm_or_si128(_mm_andnot_si128(take1, v),
                     _mm_and_si128(take1, shifted));

    shifted = _mm_slli_epi16(v, 2);
    v = _mm_or_si128(_mm_andnot_si128(take2, v),
                     _mm_and_si128(take2, shifted));

    shifted = _mm_slli_epi16(v, 4);
    v = _mm_or_si128(_mm_andnot_si128(take4, v),
                     _mm_and_si128(take4, shifted));

    shifted = _mm_slli_epi16(v, 8);
    v = _mm_or_si128(_mm_andnot_si128(take8, v),
                     _mm_and_si128(take8, shifted));

    // zero every lane whose count was 16 or more
    return _mm_and_si128(in_range, v);
}

对于 8 位

/*
 * Per-lane variable left shift of sixteen 8-bit integers, using SSE2 only.
 *
 * v  values to shift, one per byte.
 * s  shift counts, one per byte, compared as unsigned.
 *
 * Returns a vector in which each byte holds v << s for counts 0..7 and
 * zero for any count of 8 or more.
 */
__m128i sllv_epi8(__m128i v, __m128i s) {
    const __m128i bit0  = _mm_set1_epi8(1);
    const __m128i bit1  = _mm_set1_epi8(2);
    const __m128i bit2  = _mm_set1_epi8(4);
    const __m128i limit = _mm_set1_epi8(8);

    // Clamp counts to 8 (unsigned), so "count >= 8" becomes "count == 8".
    const __m128i count    = _mm_min_epu8(s, limit);
    const __m128i too_far  = _mm_cmpeq_epi8(count, limit);

    // all-ones byte mask wherever the corresponding count bit is set
    const __m128i take1 = _mm_cmpeq_epi8(_mm_and_si128(count, bit0), bit0);
    const __m128i take2 = _mm_cmpeq_epi8(_mm_and_si128(count, bit1), bit1);
    const __m128i take4 = _mm_cmpeq_epi8(_mm_and_si128(count, bit2), bit2);

    // The shifts below operate on 16-bit lanes, so the bits that would cross
    // into the neighbouring byte are cleared before each shift.
    // Selection is v ^ ((v ^ shifted) & mask): shifted where mask is set,
    // v elsewhere.
    __m128i shifted = _mm_slli_epi16(_mm_and_si128(v, _mm_set1_epi8(0x7F)), 1);
    v = _mm_xor_si128(v, _mm_and_si128(take1, _mm_xor_si128(v, shifted)));

    shifted = _mm_slli_epi16(_mm_and_si128(v, _mm_set1_epi8(0x3F)), 2);
    v = _mm_xor_si128(v, _mm_and_si128(take2, _mm_xor_si128(v, shifted)));

    shifted = _mm_slli_epi16(_mm_and_si128(v, _mm_set1_epi8(0x0F)), 4);
    v = _mm_xor_si128(v, _mm_and_si128(take4, _mm_xor_si128(v, shifted)));

    // zero every byte whose count was 8 or more
    return _mm_andnot_si128(too_far, v);
}

https://stackoverflow.com/questions/74050370/

相关文章:

awk - 使用 awk 将字段分隔符 ',' 更改为空格

r - 如何在组内创建字母序列?

javascript - 警告 IE11 用户他们的浏览器在 React 18 中不受支持

c# - 在 C# 中将小驼峰命名法转换为 PascalCase?

python - 如何避免 NumPy 中的嵌套 for 循环?

c - 使用个性系统调用使堆栈可执行

regex - 用正则表达式删除后跟多个制表符的换行符

ruby - 如何在 Ruby 中创建可调用属性

delphi - TFileStream 比 TMemoryStream 慢

c++ - 你能用折叠表达式实现 fn(x1 ^ fn(x0)) 吗?