1 // Copyright 2008 The CRC32C Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. See the AUTHORS file for names of contributors.
4
5 #include "./crc32c_sse42.h"
6
7 // In a separate source file to allow this accelerated CRC32C function to be
8 // compiled with the appropriate compiler flags to enable SSE4.2 instructions.
9
10 // This implementation is loosely based on Intel Pub 323405 from April 2011,
11 // "Fast CRC Computation for iSCSI Polynomial Using CRC32 Instruction".
12
13 #include <cstddef>
14 #include <cstdint>
15
16 #include "./crc32c_internal.h"
17 #include "./crc32c_prefetch.h"
18 #include "./crc32c_read_le.h"
19 #include "./crc32c_round_up.h"
20 #include "crc32c/crc32c_config.h"
21
22 #if HAVE_SSE42 && (defined(_M_X64) || defined(__x86_64__))
23
24 #if defined(_MSC_VER)
25 #include <intrin.h>
26 #else // !defined(_MSC_VER)
27 #include <nmmintrin.h>
28 #endif // defined(_MSC_VER)
29
30 namespace crc32c {
31
32 namespace {
33
34 constexpr const ptrdiff_t kGroups = 3;
35 constexpr const ptrdiff_t kBlock0Size = 16 * 1024 / kGroups / 64 * 64;
36 constexpr const ptrdiff_t kBlock1Size = 4 * 1024 / kGroups / 8 * 8;
37 constexpr const ptrdiff_t kBlock2Size = 1024 / kGroups / 8 * 8;
38
39 const uint32_t kBlock0SkipTable[8][16] = {
40 {0x00000000, 0xff770459, 0xfb027e43, 0x04757a1a, 0xf3e88a77, 0x0c9f8e2e,
41 0x08eaf434, 0xf79df06d, 0xe23d621f, 0x1d4a6646, 0x193f1c5c, 0xe6481805,
42 0x11d5e868, 0xeea2ec31, 0xead7962b, 0x15a09272},
43 {0x00000000, 0xc196b2cf, 0x86c1136f, 0x4757a1a0, 0x086e502f, 0xc9f8e2e0,
44 0x8eaf4340, 0x4f39f18f, 0x10dca05e, 0xd14a1291, 0x961db331, 0x578b01fe,
45 0x18b2f071, 0xd92442be, 0x9e73e31e, 0x5fe551d1},
46 {0x00000000, 0x21b940bc, 0x43728178, 0x62cbc1c4, 0x86e502f0, 0xa75c424c,
47 0xc5978388, 0xe42ec334, 0x08267311, 0x299f33ad, 0x4b54f269, 0x6aedb2d5,
48 0x8ec371e1, 0xaf7a315d, 0xcdb1f099, 0xec08b025},
49 {0x00000000, 0x104ce622, 0x2099cc44, 0x30d52a66, 0x41339888, 0x517f7eaa,
50 0x61aa54cc, 0x71e6b2ee, 0x82673110, 0x922bd732, 0xa2fefd54, 0xb2b21b76,
51 0xc354a998, 0xd3184fba, 0xe3cd65dc, 0xf38183fe},
52 {0x00000000, 0x012214d1, 0x024429a2, 0x03663d73, 0x04885344, 0x05aa4795,
53 0x06cc7ae6, 0x07ee6e37, 0x0910a688, 0x0832b259, 0x0b548f2a, 0x0a769bfb,
54 0x0d98f5cc, 0x0cbae11d, 0x0fdcdc6e, 0x0efec8bf},
55 {0x00000000, 0x12214d10, 0x24429a20, 0x3663d730, 0x48853440, 0x5aa47950,
56 0x6cc7ae60, 0x7ee6e370, 0x910a6880, 0x832b2590, 0xb548f2a0, 0xa769bfb0,
57 0xd98f5cc0, 0xcbae11d0, 0xfdcdc6e0, 0xefec8bf0},
58 {0x00000000, 0x27f8a7f1, 0x4ff14fe2, 0x6809e813, 0x9fe29fc4, 0xb81a3835,
59 0xd013d026, 0xf7eb77d7, 0x3a294979, 0x1dd1ee88, 0x75d8069b, 0x5220a16a,
60 0xa5cbd6bd, 0x8233714c, 0xea3a995f, 0xcdc23eae},
61 {0x00000000, 0x745292f2, 0xe8a525e4, 0x9cf7b716, 0xd4a63d39, 0xa0f4afcb,
62 0x3c0318dd, 0x48518a2f, 0xaca00c83, 0xd8f29e71, 0x44052967, 0x3057bb95,
63 0x780631ba, 0x0c54a348, 0x90a3145e, 0xe4f186ac},
64 };
65 const uint32_t kBlock1SkipTable[8][16] = {
66 {0x00000000, 0x79113270, 0xf22264e0, 0x8b335690, 0xe1a8bf31, 0x98b98d41,
67 0x138adbd1, 0x6a9be9a1, 0xc6bd0893, 0xbfac3ae3, 0x349f6c73, 0x4d8e5e03,
68 0x2715b7a2, 0x5e0485d2, 0xd537d342, 0xac26e132},
69 {0x00000000, 0x889667d7, 0x14c0b95f, 0x9c56de88, 0x298172be, 0xa1171569,
70 0x3d41cbe1, 0xb5d7ac36, 0x5302e57c, 0xdb9482ab, 0x47c25c23, 0xcf543bf4,
71 0x7a8397c2, 0xf215f015, 0x6e432e9d, 0xe6d5494a},
72 {0x00000000, 0xa605caf8, 0x49e7e301, 0xefe229f9, 0x93cfc602, 0x35ca0cfa,
73 0xda282503, 0x7c2deffb, 0x2273faf5, 0x8476300d, 0x6b9419f4, 0xcd91d30c,
74 0xb1bc3cf7, 0x17b9f60f, 0xf85bdff6, 0x5e5e150e},
75 {0x00000000, 0x44e7f5ea, 0x89cfebd4, 0xcd281e3e, 0x1673a159, 0x529454b3,
76 0x9fbc4a8d, 0xdb5bbf67, 0x2ce742b2, 0x6800b758, 0xa528a966, 0xe1cf5c8c,
77 0x3a94e3eb, 0x7e731601, 0xb35b083f, 0xf7bcfdd5},
78 {0x00000000, 0x59ce8564, 0xb39d0ac8, 0xea538fac, 0x62d66361, 0x3b18e605,
79 0xd14b69a9, 0x8885eccd, 0xc5acc6c2, 0x9c6243a6, 0x7631cc0a, 0x2fff496e,
80 0xa77aa5a3, 0xfeb420c7, 0x14e7af6b, 0x4d292a0f},
81 {0x00000000, 0x8eb5fb75, 0x1887801b, 0x96327b6e, 0x310f0036, 0xbfbafb43,
82 0x2988802d, 0xa73d7b58, 0x621e006c, 0xecabfb19, 0x7a998077, 0xf42c7b02,
83 0x5311005a, 0xdda4fb2f, 0x4b968041, 0xc5237b34},
84 {0x00000000, 0xc43c00d8, 0x8d947741, 0x49a87799, 0x1ec49873, 0xdaf898ab,
85 0x9350ef32, 0x576cefea, 0x3d8930e6, 0xf9b5303e, 0xb01d47a7, 0x7421477f,
86 0x234da895, 0xe771a84d, 0xaed9dfd4, 0x6ae5df0c},
87 {0x00000000, 0x7b1261cc, 0xf624c398, 0x8d36a254, 0xe9a5f1c1, 0x92b7900d,
88 0x1f813259, 0x64935395, 0xd6a79573, 0xadb5f4bf, 0x208356eb, 0x5b913727,
89 0x3f0264b2, 0x4410057e, 0xc926a72a, 0xb234c6e6},
90 };
91 const uint32_t kBlock2SkipTable[8][16] = {
92 {0x00000000, 0x8f158014, 0x1bc776d9, 0x94d2f6cd, 0x378eedb2, 0xb89b6da6,
93 0x2c499b6b, 0xa35c1b7f, 0x6f1ddb64, 0xe0085b70, 0x74daadbd, 0xfbcf2da9,
94 0x589336d6, 0xd786b6c2, 0x4354400f, 0xcc41c01b},
95 {0x00000000, 0xde3bb6c8, 0xb99b1b61, 0x67a0ada9, 0x76da4033, 0xa8e1f6fb,
96 0xcf415b52, 0x117aed9a, 0xedb48066, 0x338f36ae, 0x542f9b07, 0x8a142dcf,
97 0x9b6ec055, 0x4555769d, 0x22f5db34, 0xfcce6dfc},
98 {0x00000000, 0xde85763d, 0xb8e69a8b, 0x6663ecb6, 0x742143e7, 0xaaa435da,
99 0xccc7d96c, 0x1242af51, 0xe84287ce, 0x36c7f1f3, 0x50a41d45, 0x8e216b78,
100 0x9c63c429, 0x42e6b214, 0x24855ea2, 0xfa00289f},
101 {0x00000000, 0xd569796d, 0xaf3e842b, 0x7a57fd46, 0x5b917ea7, 0x8ef807ca,
102 0xf4affa8c, 0x21c683e1, 0xb722fd4e, 0x624b8423, 0x181c7965, 0xcd750008,
103 0xecb383e9, 0x39dafa84, 0x438d07c2, 0x96e47eaf},
104 {0x00000000, 0x6ba98c6d, 0xd75318da, 0xbcfa94b7, 0xab4a4745, 0xc0e3cb28,
105 0x7c195f9f, 0x17b0d3f2, 0x5378f87b, 0x38d17416, 0x842be0a1, 0xef826ccc,
106 0xf832bf3e, 0x939b3353, 0x2f61a7e4, 0x44c82b89},
107 {0x00000000, 0xa6f1f0f6, 0x480f971d, 0xeefe67eb, 0x901f2e3a, 0x36eedecc,
108 0xd810b927, 0x7ee149d1, 0x25d22a85, 0x8323da73, 0x6dddbd98, 0xcb2c4d6e,
109 0xb5cd04bf, 0x133cf449, 0xfdc293a2, 0x5b336354},
110 {0x00000000, 0x4ba4550a, 0x9748aa14, 0xdcecff1e, 0x2b7d22d9, 0x60d977d3,
111 0xbc3588cd, 0xf791ddc7, 0x56fa45b2, 0x1d5e10b8, 0xc1b2efa6, 0x8a16baac,
112 0x7d87676b, 0x36233261, 0xeacfcd7f, 0xa16b9875},
113 {0x00000000, 0xadf48b64, 0x5e056039, 0xf3f1eb5d, 0xbc0ac072, 0x11fe4b16,
114 0xe20fa04b, 0x4ffb2b2f, 0x7df9f615, 0xd00d7d71, 0x23fc962c, 0x8e081d48,
115 0xc1f33667, 0x6c07bd03, 0x9ff6565e, 0x3202dd3a},
116 };
117
118 constexpr const ptrdiff_t kPrefetchHorizon = 256;
119
120 } // namespace
121
ExtendSse42(uint32_t crc,const uint8_t * data,size_t size)122 uint32_t ExtendSse42(uint32_t crc, const uint8_t* data, size_t size) {
123 const uint8_t* p = data;
124 const uint8_t* e = data + size;
125 uint32_t l = crc ^ kCRC32Xor;
126
127 #define STEP1 \
128 do { \
129 l = _mm_crc32_u8(l, *p++); \
130 } while (0)
131
132 #define STEP4(crc) \
133 do { \
134 crc = _mm_crc32_u32(crc, ReadUint32LE(p)); \
135 p += 4; \
136 } while (0)
137
138 #define STEP8(crc, data) \
139 do { \
140 crc = _mm_crc32_u64(crc, ReadUint64LE(data)); \
141 data += 8; \
142 } while (0)
143
144 #define STEP8BY3(crc0, crc1, crc2, p0, p1, p2) \
145 do { \
146 STEP8(crc0, p0); \
147 STEP8(crc1, p1); \
148 STEP8(crc2, p2); \
149 } while (0)
150
151 #define STEP8X3(crc0, crc1, crc2, bs) \
152 do { \
153 crc0 = _mm_crc32_u64(crc0, ReadUint64LE(p)); \
154 crc1 = _mm_crc32_u64(crc1, ReadUint64LE(p + bs)); \
155 crc2 = _mm_crc32_u64(crc2, ReadUint64LE(p + 2 * bs)); \
156 p += 8; \
157 } while (0)
158
159 #define SKIP_BLOCK(crc, tab) \
160 do { \
161 crc = tab[0][crc & 0xf] ^ tab[1][(crc >> 4) & 0xf] ^ \
162 tab[2][(crc >> 8) & 0xf] ^ tab[3][(crc >> 12) & 0xf] ^ \
163 tab[4][(crc >> 16) & 0xf] ^ tab[5][(crc >> 20) & 0xf] ^ \
164 tab[6][(crc >> 24) & 0xf] ^ tab[7][(crc >> 28) & 0xf]; \
165 } while (0)
166
167 // Point x at first 8-byte aligned byte in the buffer. This might be past the
168 // end of the buffer.
169 const uint8_t* x = RoundUp<8>(p);
170 if (x <= e) {
171 // Process bytes p is 8-byte aligned.
172 while (p != x) {
173 STEP1;
174 }
175 }
176
177 // Proccess the data in predetermined block sizes with tables for quickly
178 // combining the checksum. Experimentally it's better to use larger block
179 // sizes where possible so use a hierarchy of decreasing block sizes.
180 uint64_t l64 = l;
181 while ((e - p) >= kGroups * kBlock0Size) {
182 uint64_t l641 = 0;
183 uint64_t l642 = 0;
184 for (int i = 0; i < kBlock0Size; i += 8 * 8) {
185 // Prefetch ahead to hide latency.
186 RequestPrefetch(p + kPrefetchHorizon);
187 RequestPrefetch(p + kBlock0Size + kPrefetchHorizon);
188 RequestPrefetch(p + 2 * kBlock0Size + kPrefetchHorizon);
189
190 // Process 64 bytes at a time.
191 STEP8X3(l64, l641, l642, kBlock0Size);
192 STEP8X3(l64, l641, l642, kBlock0Size);
193 STEP8X3(l64, l641, l642, kBlock0Size);
194 STEP8X3(l64, l641, l642, kBlock0Size);
195 STEP8X3(l64, l641, l642, kBlock0Size);
196 STEP8X3(l64, l641, l642, kBlock0Size);
197 STEP8X3(l64, l641, l642, kBlock0Size);
198 STEP8X3(l64, l641, l642, kBlock0Size);
199 }
200
201 // Combine results.
202 SKIP_BLOCK(l64, kBlock0SkipTable);
203 l64 ^= l641;
204 SKIP_BLOCK(l64, kBlock0SkipTable);
205 l64 ^= l642;
206 p += (kGroups - 1) * kBlock0Size;
207 }
208 while ((e - p) >= kGroups * kBlock1Size) {
209 uint64_t l641 = 0;
210 uint64_t l642 = 0;
211 for (int i = 0; i < kBlock1Size; i += 8) {
212 STEP8X3(l64, l641, l642, kBlock1Size);
213 }
214 SKIP_BLOCK(l64, kBlock1SkipTable);
215 l64 ^= l641;
216 SKIP_BLOCK(l64, kBlock1SkipTable);
217 l64 ^= l642;
218 p += (kGroups - 1) * kBlock1Size;
219 }
220 while ((e - p) >= kGroups * kBlock2Size) {
221 uint64_t l641 = 0;
222 uint64_t l642 = 0;
223 for (int i = 0; i < kBlock2Size; i += 8) {
224 STEP8X3(l64, l641, l642, kBlock2Size);
225 }
226 SKIP_BLOCK(l64, kBlock2SkipTable);
227 l64 ^= l641;
228 SKIP_BLOCK(l64, kBlock2SkipTable);
229 l64 ^= l642;
230 p += (kGroups - 1) * kBlock2Size;
231 }
232
233 // Process bytes 16 at a time
234 while ((e - p) >= 16) {
235 STEP8(l64, p);
236 STEP8(l64, p);
237 }
238
239 l = static_cast<uint32_t>(l64);
240 // Process the last few bytes.
241 while (p != e) {
242 STEP1;
243 }
244 #undef SKIP_BLOCK
245 #undef STEP8X3
246 #undef STEP8BY3
247 #undef STEP8
248 #undef STEP4
249 #undef STEP1
250
251 return l ^ kCRC32Xor;
252 }
253
254 } // namespace crc32c
255
256 #endif // HAVE_SSE42 && (defined(_M_X64) || defined(__x86_64__))
257