semicongine/thirdparty/zippy/zippy/adler32_simd.nim @ 1123:657bb0b2af45

add: thirdparty libraries into repo

author: sam <sam@basx.dev>
date:   Sat, 27 Apr 2024 22:04:30 +0700
import common

# These functions are Nim conversions of an original implementation
# from the Chromium repository. That implementation is:
#
# Copyright 2017 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the Chromium source repository LICENSE file.

const
  nmax = 5552 # NMAX from zlib: the most bytes that can be summed before s1/s2 must be reduced mod 65521
  blockSize = 32.uint32

when defined(amd64):
  when defined(gcc) or defined(clang):
    {.localPassC: "-mssse3".}

  {.push header: "emmintrin.h".}

  type M128i {.importc: "__m128i".} = object

  template MM_SHUFFLE(z, y, x, w: int | uint): int32 =
    ((z shl 6) or (y shl 4) or (x shl 2) or w).int32

  func mm_loadu_si128(p: pointer): M128i {.importc: "_mm_loadu_si128".}

  func mm_setzero_si128(): M128i {.importc: "_mm_setzero_si128".}

  func mm_set_epi32(a, b, c, d: int32 | uint32): M128i {.importc: "_mm_set_epi32".}

  func mm_setr_epi8(
    a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p: int8 | uint8
  ): M128i {.importc: "_mm_setr_epi8".}

  func mm_set1_epi16(a: int16 | uint16): M128i {.importc: "_mm_set1_epi16".}

  func mm_add_epi32(a, b: M128i): M128i {.importc: "_mm_add_epi32".}

  func mm_sad_epu8(a, b: M128i): M128i {.importc: "_mm_sad_epu8".}

  func mm_madd_epi16(a, b: M128i): M128i {.importc: "_mm_madd_epi16".}

  func mm_slli_epi32(a: M128i, imm8: int32 | uint32): M128i {.importc: "_mm_slli_epi32".}

  func mm_shuffle_epi32(a: M128i, imm8: int32 | uint32): M128i {.importc: "_mm_shuffle_epi32".}

  func mm_cvtsi128_si32(a: M128i): int32 {.importc: "_mm_cvtsi128_si32".}

  {.pop.}

  {.push header: "tmmintrin.h".}

  func mm_maddubs_epi16(a, b: M128i): M128i {.importc: "_mm_maddubs_epi16".}

  {.pop.}

  # Adler-32 of the len bytes at src, processing 32 bytes per iteration with SSSE3.
  proc adler32_ssse3*(src: pointer, len: int): uint32 =
    if len == 0:
      return 1
    if len < 0:
      raise newException(ZippyError, "Adler-32 len < 0")
    if len.uint64 > uint32.high:
      raise newException(ZippyError, "Adler-32 len > uint32.high")

    let src = cast[ptr UncheckedArray[uint8]](src)

    var
      pos: uint32
      remaining = cast[uint32](len)
      s1 = 1.uint32
      s2 = 0.uint32

    var blocks = remaining div blockSize
    remaining -= (blocks * blockSize)

    let
      tap1 = mm_setr_epi8(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17)
      tap2 = mm_setr_epi8(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)
      zero = mm_setzero_si128()
      ones = mm_set1_epi16(1)

    while blocks > 0:
      var n = nmax div blockSize
      if n > blocks:
        n = blocks
      blocks -= n

      var
        vecPs = mm_set_epi32(0, 0, 0, s1 * n)
        vecS2 = mm_set_epi32(0, 0, 0, s2)
        vecS1 = mm_set_epi32(0, 0, 0, 0)

      while n > 0:
        let
          bytes1 = mm_loadu_si128(src[pos + 0].addr)
          bytes2 = mm_loadu_si128(src[pos + 16].addr)

        # Running s1 totals; folded into s2 as (vecPs shl 5) after the loop.
        vecPs = mm_add_epi32(vecPs, vecS1)

        vecS1 = mm_add_epi32(vecS1, mm_sad_epu8(bytes1, zero))
        let mad1 = mm_maddubs_epi16(bytes1, tap1)
        vecS2 = mm_add_epi32(vecS2, mm_madd_epi16(mad1, ones))

        vecS1 = mm_add_epi32(vecS1, mm_sad_epu8(bytes2, zero))
        let mad2 = mm_maddubs_epi16(bytes2, tap2)
        vecS2 = mm_add_epi32(vecS2, mm_madd_epi16(mad2, ones))

        dec n
        pos += 32

      vecS2 = mm_add_epi32(vecS2, mm_slli_epi32(vecPs, 5))

      vecS1 = mm_add_epi32(vecS1, mm_shuffle_epi32(vecS1, MM_SHUFFLE(2, 3, 0, 1)))
      vecS1 = mm_add_epi32(vecS1, mm_shuffle_epi32(vecS1, MM_SHUFFLE(1, 0, 3, 2)))

      s1 += cast[uint32](mm_cvtsi128_si32(vecS1))

      vecS2 = mm_add_epi32(vecS2, mm_shuffle_epi32(vecS2, MM_SHUFFLE(2, 3, 0, 1)))
      vecS2 = mm_add_epi32(vecS2, mm_shuffle_epi32(vecS2, MM_SHUFFLE(1, 0, 3, 2)))

      s2 = cast[uint32](mm_cvtsi128_si32(vecS2))

      s1 = s1 mod 65521
      s2 = s2 mod 65521

    for i in 0 ..< remaining:
      s1 += src[pos + i]
      s2 += s1

    s1 = s1 mod 65521
    s2 = s2 mod 65521

    result = (s2 shl 16) or s1

elif defined(arm64):
  {.push header: "arm_neon.h".}

  type
    uint8x16 {.importc: "uint8x16_t".} = object
    uint16x8 {.importc: "uint16x8_t".} = object
    uint32x4 {.importc: "uint32x4_t".} = object
    uint8x8 {.importc: "uint8x8_t".} = object
    uint16x4 {.importc: "uint16x4_t".} = object
    uint32x2 {.importc: "uint32x2_t".} = object

  func vmovq_n_u32(a: uint32): uint32x4
  func vmovq_n_u16(a: uint16): uint16x8
  func vld1q_u32(p: pointer): uint32x4
  func vld1q_lane_u32(p: pointer, v: uint32x4, lane: int): uint32x4
  func vld1q_u8(p: pointer): uint8x16
  func vld1_u16(p: pointer): uint16x4
  func vaddq_u32(a, b: uint32x4): uint32x4
  func vpaddlq_u8(a: uint8x16): uint16x8
  func vpadalq_u8(a: uint16x8, b: uint8x16): uint16x8
  func vpadalq_u16(a: uint32x4, b: uint16x8): uint32x4
  func vget_low_u8(a: uint8x16): uint8x8
  func vget_high_u8(a: uint8x16): uint8x8
  func vaddw_u8(a: uint16x8, b: uint8x8): uint16x8
  func vshlq_n_u32(a: uint32x4, n: int): uint32x4
  func vpadd_u32(a, b: uint32x2): uint32x2
  func vget_low_u32(a: uint32x4): uint32x2
  func vget_high_u32(a: uint32x4): uint32x2
  func vget_lane_u32(a: uint32x2, lane: int): uint32
  func vget_low_u16(a: uint16x8): uint16x4
  func vget_high_u16(a: uint16x8): uint16x4
  func vmlal_u16(a: uint32x4, b, c: uint16x4): uint32x4

  {.pop.}

  # Adler-32 of the len bytes at src, processing 32 bytes per iteration with NEON.
  proc adler32_neon*(src: pointer, len: int): uint32 =
    if len == 0:
      return 1
    if len < 0:
      raise newException(ZippyError, "Adler-32 len < 0")
    if len.uint64 > uint32.high:
      raise newException(ZippyError, "Adler-32 len > uint32.high")

    let src = cast[ptr UncheckedArray[uint8]](src)

    var
      pos: uint32
      remaining = cast[uint32](len)
      s1 = 1.uint32
      s2 = 0.uint32

    const blockSize = 32.uint32

    var blocks = remaining div blockSize
    remaining -= (blocks * blockSize)

    var wtf1, wtf2, wtf3, wtf4, wtf5, wtf6, wtf7, wtf8, wtf9: uint16x4
    block:
      var tmp = [32.uint16, 31, 30, 29]
      wtf1 = vld1_u16(tmp.addr)
      tmp = [28.uint16, 27, 26, 25]
      wtf2 = vld1_u16(tmp.addr)
      tmp = [24.uint16, 23, 22, 21]
      wtf3 = vld1_u16(tmp.addr)
      tmp = [20.uint16, 19, 18, 17]
      wtf4 = vld1_u16(tmp.addr)
      tmp = [16.uint16, 15, 14, 13]
      wtf5 = vld1_u16(tmp.addr)
      tmp = [12.uint16, 11, 10, 9]
      wtf6 = vld1_u16(tmp.addr)
      tmp = [8.uint16, 7, 6, 5]
      wtf7 = vld1_u16(tmp.addr)
      tmp = [4.uint16, 3, 2, 1]
      wtf8 = vld1_u16(tmp.addr)

    while blocks > 0:
      var n = nmax div blockSize
      if n > blocks:
        n = blocks
      blocks -= n

      var
        vecS2 = vmovq_n_u32(0)
        vecS1 = vmovq_n_u32(0)
        vecColumnSum1 = vmovq_n_u16(0)
        vecColumnSum2 = vmovq_n_u16(0)
        vecColumnSum3 = vmovq_n_u16(0)
        vecColumnSum4 = vmovq_n_u16(0)

      block:
        var tmp = s1 * n
        vecS2 = vld1q_lane_u32(tmp.addr, vecS2, 0)

      while n > 0:
        let
          bytes1 = vld1q_u8(src[pos + 0].addr)
          bytes2 = vld1q_u8(src[pos + 16].addr)

        vecS2 = vaddq_u32(vecS2, vecS1)

        vecS1 = vpadalq_u16(vecS1, vpadalq_u8(vpaddlq_u8(bytes1), bytes2))

        vecColumnSum1 = vaddw_u8(vecColumnSum1, vget_low_u8(bytes1))
        vecColumnSum2 = vaddw_u8(vecColumnSum2, vget_high_u8(bytes1))
        vecColumnSum3 = vaddw_u8(vecColumnSum3, vget_low_u8(bytes2))
        vecColumnSum4 = vaddw_u8(vecColumnSum4, vget_high_u8(bytes2))

        dec n
        pos += 32

      # Multiply the running sums by 32 (the block size), then add the per-column
      # byte sums weighted by their positions (32 .. 1).
      vecS2 = vshlq_n_u32(vecS2, 5)

      vecS2 = vmlal_u16(vecS2, vget_low_u16(vecColumnSum1), wtf1)
      vecS2 = vmlal_u16(vecS2, vget_high_u16(vecColumnSum1), wtf2)
      vecS2 = vmlal_u16(vecS2, vget_low_u16(vecColumnSum2), wtf3)
      vecS2 = vmlal_u16(vecS2, vget_high_u16(vecColumnSum2), wtf4)
      vecS2 = vmlal_u16(vecS2, vget_low_u16(vecColumnSum3), wtf5)
      vecS2 = vmlal_u16(vecS2, vget_high_u16(vecColumnSum3), wtf6)
      vecS2 = vmlal_u16(vecS2, vget_low_u16(vecColumnSum4), wtf7)
      vecS2 = vmlal_u16(vecS2, vget_high_u16(vecColumnSum4), wtf8)

      let
        sum1 = vpadd_u32(vget_low_u32(vecS1), vget_high_u32(vecS1))
        sum2 = vpadd_u32(vget_low_u32(vecS2), vget_high_u32(vecS2))
        s1s2 = vpadd_u32(sum1, sum2)

      s1 += vget_lane_u32(s1s2, 0)
      s2 += vget_lane_u32(s1s2, 1)

      s1 = s1 mod 65521
      s2 = s2 mod 65521

    for i in 0 ..< remaining:
      s1 += src[pos + i]
      s2 += s1

    s1 = s1 mod 65521
    s2 = s2 mod 65521

    result = (s2 shl 16) or s1
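Both procs compute the same RFC 1950 Adler-32 checksum: s1 is 1 plus the sum of all bytes and s2 is the sum of the running s1 values, each reduced mod 65521, with the result packed as (s2 shl 16) or s1. The sketch below is a hypothetical scratch test, not part of this repository, that compares whichever SIMD path is available against a plain scalar implementation; it assumes it is compiled from this directory so that import adler32_simd (and its import common) resolve.

  # adler32_simd_check.nim -- hypothetical scratch test, not part of the repo.
  import adler32_simd # assumes compiling from semicongine/thirdparty/zippy/zippy/

  # Scalar RFC 1950 Adler-32, used only as a reference for the SIMD paths above.
  proc adler32Reference(src: pointer, len: int): uint32 =
    let src = cast[ptr UncheckedArray[uint8]](src)
    var
      s1 = 1.uint32
      s2 = 0.uint32
    for i in 0 ..< len:
      s1 = (s1 + src[i].uint32) mod 65521
      s2 = (s2 + s1) mod 65521
    result = (s2 shl 16) or s1

  # Deterministic test data.
  var data = newSeq[uint8](100_000)
  for i in 0 ..< data.len:
    data[i] = uint8(i mod 251)

  when defined(amd64):
    doAssert adler32_ssse3(data[0].addr, data.len) ==
      adler32Reference(data[0].addr, data.len)
  elif defined(arm64):
    doAssert adler32_neon(data[0].addr, data.len) ==
      adler32Reference(data[0].addr, data.len)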