Fwd: Questions with the in_cksumdata() function in sys/amd64/amd64/in_cksum.c
Tiwei Bie
btw at mail.ustc.edu.cn
Mon Oct 20 10:35:26 UTC 2014
> > > I would be not surprised if this manual prefetching by explicit reads
> > > causes slowdown of the function. I suspect it could confuse hardware
> > > prefetcher by breaking the linear pattern, or the patch could break
> > > the logic of the limited forward-looking oracle by reading too far
> > > from the current linear read tip.
> > >
> > > Also, it could confuse the data flow engine if the register allocator
> > > is unable to see that the read value is needed not right now, and cause
> > > unneeded stall while next cache line is fetched.
> > >
> > > Sure, all my speculations are pure garbage until confirmed by
> > > measurements with pmc, but I think that the patch below must be
> > > benchmarked to confirm any value it provides as well. My opinion is,
> > > we should either remove the manual prefetch, or do it with PREFETCHLX
> > > instructions only, instead of real read.
> >
> > I have done a rather simple test. And the results are listed as follows:
> >
> Yes, too simple to draw a conclusion, IMO.
>
> Please look at the ministat(1). I think that the test run length
> is too short to come with any decisions. The length x 3 runs does not
> give enough confidence; but ministat would provide the numbers to judge.
I have run ministat on these results, and its output is shown below.
It indicates that the differences are significant at the 99.5% confidence level.
$ ministat -w 80 -s -c 99.5 32BYTES_WITH_PRE_READ 32BYTES_WITHOUT_MANUAL_PREFETCH 64BYTES_WITH_PRE_READ 64BYTES_WITHOUT_MANUAL_PREFETCH 64BYTES_WITH_PREFETCH_INSTRUCTION
x 32BYTES_WITH_PRE_READ
+ 32BYTES_WITHOUT_MANUAL_PREFETCH
* 64BYTES_WITH_PRE_READ
% 64BYTES_WITHOUT_MANUAL_PREFETCH
# 64BYTES_WITH_PREFETCH_INSTRUCTION
+--------------------------------------------------------------------------------+
|% # |
|% *** # # ++ + xx x|
| |_MA__||
| |A_| |
| |A| |
|A |
| |MA_| |
+--------------------------------------------------------------------------------+
N Min Max Median Avg Stddev
x 3 0.768854 0.773803 0.770332 0.77099633 0.0025405028
+ 3 0.725956 0.728841 0.726651 0.72714933 0.0015056754
Difference at 99.5% confidence
-0.043847 +/- 0.0122301
-5.68706% +/- 1.58627%
(Student's t, pooled s = 0.00208821)
* 3 0.702416 0.704498 0.703648 0.70352067 0.0010468244
Difference at 99.5% confidence
-0.0674757 +/- 0.0113792
-8.75175% +/- 1.47591%
(Student's t, pooled s = 0.00194294)
% 3 0.697971 0.698314 0.698071 0.69811867 0.00017639822
Difference at 99.5% confidence
-0.0728777 +/- 0.0105464
-9.4524% +/- 1.36789%
(Student's t, pooled s = 0.00180073)
# 3 0.711962 0.715883 0.712118 0.713321 0.0022201277
Difference at 99.5% confidence
-0.0576753 +/- 0.0139724
-7.48062% +/- 1.81225%
(Student's t, pooled s = 0.0023857)
> > #1. Read 32 bytes with manual pre-read in each loop:
> >
> > $ cc main.c -D_32BYTES_WITH_PRE_READ
> > $ for i in `seq 3`; do ./a.out; done
> > 0.768854
> > 0.770332
> > 0.773803
> >
> > #2. Read 64 bytes with manual pre-read in each loop:
> >
> > $ cc main.c -D_64BYTES_WITH_PRE_READ
> > $ for i in `seq 3`; do ./a.out; done
> > 0.702416
> > 0.703648
> > 0.704498
> >
> > #3. Read 32 bytes without manual prefetch in each loop:
> >
> > $ cc main.c -D_32BYTES_WITHOUT_MANUAL_PREFETCH
> > $ for i in `seq 3`; do ./a.out; done
> > 0.726651
> > 0.728841
> > 0.725956
> >
> > #4. Read 64 bytes without manual prefetch in each loop:
> >
> > $ cc main.c -D_64BYTES_WITHOUT_MANUAL_PREFETCH
> > $ for i in `seq 3`; do ./a.out; done
> > 0.698071
> > 0.697971
> > 0.698314
> >
> > #5. Read 64 bytes with PREFETCH instruction:
> >
> > $ cc main.c -D_64BYTES_WITH_PREFETCH_INSTRUCTION
> > $ for i in `seq 3`; do ./a.out; done
> > 0.715883
> > 0.712118
> > 0.711962
> >
> > The test is very simple. I just ran the in_cksumdata() function on one
> > million packets. And the result is the time spent on calculating these
> > checksums.
> >
> > As we can see from the results, when reading 64 bytes of data without a manual
> > prefetch operation in each loop, the speed is the fastest. So, I think reading
> > a whole cache line in each loop is helpful.
> >
> > ---
> >
> > The computer that I run the test program on:
> >
> > $ dmesg | grep CPU:
> > CPU: Intel(R) Core(TM) i5-2400 CPU @ 3.10GHz (3093.03-MHz K8-class CPU)
> >
> > ---
> >
> > The test program:
> >
> > #include <stdio.h>
> > #include <sys/types.h>
> > #include <sys/time.h>
> >
> > /* ------------------------------------------------------------------------ */
> >
> > #define PACKET_SIZE 1500
> > #define BUFFER_SIZE ((PACKET_SIZE) << 20)
> > static unsigned char buffer[BUFFER_SIZE];
> >
> > /* ------------------------------------------------------------------------ */
> >
> > /*
> > * Checksum routine for Internet Protocol family headers
> > * (Portable Alpha version).
> > *
> > * This routine is very heavily used in the network
> > * code and should be modified for each CPU to be as fast as possible.
> > */
> >
> > #define ADDCARRY(x) (x > 65535 ? x -= 65535 : x)
> > #define REDUCE32 \
> > { \
> > q_util.q = sum; \
> > sum = q_util.s[0] + q_util.s[1] + q_util.s[2] + q_util.s[3]; \
> > }
> > #define REDUCE16 \
> > { \
> > q_util.q = sum; \
> > l_util.l = q_util.s[0] + q_util.s[1] + q_util.s[2] + q_util.s[3]; \
> > sum = l_util.s[0] + l_util.s[1]; \
> > ADDCARRY(sum); \
> > }
> >
> > static const u_int32_t in_masks[] = {
> > /*0 bytes*/ /*1 byte*/ /*2 bytes*/ /*3 bytes*/
> > 0x00000000, 0x000000FF, 0x0000FFFF, 0x00FFFFFF, /* offset 0 */
> > 0x00000000, 0x0000FF00, 0x00FFFF00, 0xFFFFFF00, /* offset 1 */
> > 0x00000000, 0x00FF0000, 0xFFFF0000, 0xFFFF0000, /* offset 2 */
> > 0x00000000, 0xFF000000, 0xFF000000, 0xFF000000, /* offset 3 */
> > };
> >
> > union l_util {
> > u_int16_t s[2];
> > u_int32_t l;
> > };
> > union q_util {
> > u_int16_t s[4];
> > u_int32_t l[2];
> > u_int64_t q;
> > };
> >
> > /* ------------------------------------------------------------------------ */
> >
> > //#define _32BYTES_WITH_PRE_READ
> > //#define _64BYTES_WITH_PRE_READ
> > //#define _32BYTES_WITHOUT_MANUAL_PREFETCH
> > //#define _64BYTES_WITHOUT_MANUAL_PREFETCH
> > //#define _64BYTES_WITH_PREFETCH_INSTRUCTION
> >
> > #ifdef _32BYTES_WITH_PRE_READ
> > static u_int64_t
> > in_cksumdata(const void *buf, int len)
> > {
> > const u_int32_t *lw = (const u_int32_t *) buf;
> > u_int64_t sum = 0;
> > u_int64_t prefilled;
> > int offset;
> > union q_util q_util;
> >
> > if ((3 & (long) lw) == 0 && len == 20) {
> > sum = (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3] + lw[4];
> > REDUCE32;
> > return sum;
> > }
> >
> > if ((offset = 3 & (long) lw) != 0) {
> > const u_int32_t *masks = in_masks + (offset << 2);
> > lw = (u_int32_t *) (((long) lw) - offset);
> > sum = *lw++ & masks[len >= 3 ? 3 : len];
> > len -= 4 - offset;
> > if (len <= 0) {
> > REDUCE32;
> > return sum;
> > }
> > }
> > #if 0
> > /*
> > * Force to cache line boundary.
> > */
> > offset = 32 - (0x1f & (long) lw);
> > if (offset < 32 && len > offset) {
> > len -= offset;
> > if (4 & offset) {
> > sum += (u_int64_t) lw[0];
> > lw += 1;
> > }
> > if (8 & offset) {
> > sum += (u_int64_t) lw[0] + lw[1];
> > lw += 2;
> > }
> > if (16 & offset) {
> > sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3];
> > lw += 4;
> > }
> > }
> > #endif
> > /*
> > * access prefilling to start load of next cache line.
> > * then add current cache line
> > * save result of prefilling for loop iteration.
> > */
> > prefilled = lw[0];
> > while ((len -= 32) >= 4) {
> > u_int64_t prefilling = lw[8];
> > sum += prefilled + lw[1] + lw[2] + lw[3]
> > + lw[4] + lw[5] + lw[6] + lw[7];
> > lw += 8;
> > prefilled = prefilling;
> > }
> > if (len >= 0) {
> > sum += prefilled + lw[1] + lw[2] + lw[3]
> > + lw[4] + lw[5] + lw[6] + lw[7];
> > lw += 8;
> > } else {
> > len += 32;
> > }
> > while ((len -= 16) >= 0) {
> > sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3];
> > lw += 4;
> > }
> > len += 16;
> > while ((len -= 4) >= 0) {
> > sum += (u_int64_t) *lw++;
> > }
> > len += 4;
> > if (len > 0)
> > sum += (u_int64_t) (in_masks[len] & *lw);
> > REDUCE32;
> > return sum;
> > }
> > #endif
> >
> > #ifdef _64BYTES_WITH_PRE_READ
> > static u_int64_t
> > in_cksumdata(const void *buf, int len)
> > {
> > const u_int32_t *lw = (const u_int32_t *) buf;
> > u_int64_t sum = 0;
> > u_int64_t prefilled;
> > int offset;
> > union q_util q_util;
> >
> > if ((3 & (long) lw) == 0 && len == 20) {
> > sum = (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3] + lw[4];
> > REDUCE32;
> > return sum;
> > }
> >
> > if ((offset = 3 & (long) lw) != 0) {
> > const u_int32_t *masks = in_masks + (offset << 2);
> > lw = (u_int32_t *) (((long) lw) - offset);
> > sum = *lw++ & masks[len >= 3 ? 3 : len];
> > len -= 4 - offset;
> > if (len <= 0) {
> > REDUCE32;
> > return sum;
> > }
> > }
> > #if 0
> > /*
> > * Force to cache line boundary.
> > */
> > offset = 32 - (0x1f & (long) lw);
> > if (offset < 32 && len > offset) {
> > len -= offset;
> > if (4 & offset) {
> > sum += (u_int64_t) lw[0];
> > lw += 1;
> > }
> > if (8 & offset) {
> > sum += (u_int64_t) lw[0] + lw[1];
> > lw += 2;
> > }
> > if (16 & offset) {
> > sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3];
> > lw += 4;
> > }
> > }
> > #endif
> > /*
> > * access prefilling to start load of next cache line.
> > * then add current cache line
> > * save result of prefilling for loop iteration.
> > */
> > prefilled = lw[0];
> > while ((len -= 64) >= 4) {
> > u_int64_t prefilling = lw[16];
> > sum += prefilled + lw[1] + lw[2] + lw[3]
> > + lw[4] + lw[5] + lw[6] + lw[7]
> > + lw[8] + lw[9] + lw[10] + lw[11]
> > + lw[12] + lw[13] + lw[14] + lw[15];
> > lw += 16;
> > prefilled = prefilling;
> > }
> > if (len >= 0) {
> > sum += prefilled + lw[1] + lw[2] + lw[3]
> > + lw[4] + lw[5] + lw[6] + lw[7]
> > + lw[8] + lw[9] + lw[10] + lw[11]
> > + lw[12] + lw[13] + lw[14] + lw[15];
> > lw += 16;
> > } else {
> > len += 64;
> > }
> > while ((len -= 16) >= 0) {
> > sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3];
> > lw += 4;
> > }
> > len += 16;
> > while ((len -= 4) >= 0) {
> > sum += (u_int64_t) *lw++;
> > }
> > len += 4;
> > if (len > 0)
> > sum += (u_int64_t) (in_masks[len] & *lw);
> > REDUCE32;
> > return sum;
> > }
> > #endif
> >
> > #ifdef _32BYTES_WITHOUT_MANUAL_PREFETCH
> > static u_int64_t
> > in_cksumdata(const void *buf, int len)
> > {
> > const u_int32_t *lw = (const u_int32_t *) buf;
> > u_int64_t sum = 0;
> > int offset;
> > union q_util q_util;
> >
> > if ((3 & (long) lw) == 0 && len == 20) {
> > sum = (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3] + lw[4];
> > REDUCE32;
> > return sum;
> > }
> >
> > if ((offset = 3 & (long) lw) != 0) {
> > const u_int32_t *masks = in_masks + (offset << 2);
> > lw = (u_int32_t *) (((long) lw) - offset);
> > sum = *lw++ & masks[len >= 3 ? 3 : len];
> > len -= 4 - offset;
> > if (len <= 0) {
> > REDUCE32;
> > return sum;
> > }
> > }
> > #if 0
> > /*
> > * Force to cache line boundary.
> > */
> > offset = 32 - (0x1f & (long) lw);
> > if (offset < 32 && len > offset) {
> > len -= offset;
> > if (4 & offset) {
> > sum += (u_int64_t) lw[0];
> > lw += 1;
> > }
> > if (8 & offset) {
> > sum += (u_int64_t) lw[0] + lw[1];
> > lw += 2;
> > }
> > if (16 & offset) {
> > sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3];
> > lw += 4;
> > }
> > }
> > #endif
> > /*
> > * access prefilling to start load of next cache line.
> > * then add current cache line
> > * save result of prefilling for loop iteration.
> > */
> > while ((len -= 32) >= 4) {
> > sum += lw[0] + lw[1] + lw[2] + lw[3]
> > + lw[4] + lw[5] + lw[6] + lw[7];
> > lw += 8;
> > }
> > if (len >= 0) {
> > sum += lw[0] + lw[1] + lw[2] + lw[3]
> > + lw[4] + lw[5] + lw[6] + lw[7];
> > lw += 8;
> > } else {
> > len += 32;
> > }
> > while ((len -= 16) >= 0) {
> > sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3];
> > lw += 4;
> > }
> > len += 16;
> > while ((len -= 4) >= 0) {
> > sum += (u_int64_t) *lw++;
> > }
> > len += 4;
> > if (len > 0)
> > sum += (u_int64_t) (in_masks[len] & *lw);
> > REDUCE32;
> > return sum;
> > }
> > #endif
> >
> > #ifdef _64BYTES_WITHOUT_MANUAL_PREFETCH
> > static u_int64_t
> > in_cksumdata(const void *buf, int len)
> > {
> > const u_int32_t *lw = (const u_int32_t *) buf;
> > u_int64_t sum = 0;
> > int offset;
> > union q_util q_util;
> >
> > if ((3 & (long) lw) == 0 && len == 20) {
> > sum = (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3] + lw[4];
> > REDUCE32;
> > return sum;
> > }
> >
> > if ((offset = 3 & (long) lw) != 0) {
> > const u_int32_t *masks = in_masks + (offset << 2);
> > lw = (u_int32_t *) (((long) lw) - offset);
> > sum = *lw++ & masks[len >= 3 ? 3 : len];
> > len -= 4 - offset;
> > if (len <= 0) {
> > REDUCE32;
> > return sum;
> > }
> > }
> > #if 0
> > /*
> > * Force to cache line boundary.
> > */
> > offset = 32 - (0x1f & (long) lw);
> > if (offset < 32 && len > offset) {
> > len -= offset;
> > if (4 & offset) {
> > sum += (u_int64_t) lw[0];
> > lw += 1;
> > }
> > if (8 & offset) {
> > sum += (u_int64_t) lw[0] + lw[1];
> > lw += 2;
> > }
> > if (16 & offset) {
> > sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3];
> > lw += 4;
> > }
> > }
> > #endif
> > /*
> > * access prefilling to start load of next cache line.
> > * then add current cache line
> > * save result of prefilling for loop iteration.
> > */
> > while ((len -= 64) >= 4) {
> > sum += lw[0] + lw[1] + lw[2] + lw[3]
> > + lw[4] + lw[5] + lw[6] + lw[7]
> > + lw[8] + lw[9] + lw[10] + lw[11]
> > + lw[12] + lw[13] + lw[14] + lw[15];
> > lw += 16;
> > }
> > if (len >= 0) {
> > sum += lw[0] + lw[1] + lw[2] + lw[3]
> > + lw[4] + lw[5] + lw[6] + lw[7]
> > + lw[8] + lw[9] + lw[10] + lw[11]
> > + lw[12] + lw[13] + lw[14] + lw[15];
> > lw += 16;
> > } else {
> > len += 64;
> > }
> > while ((len -= 16) >= 0) {
> > sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3];
> > lw += 4;
> > }
> > len += 16;
> > while ((len -= 4) >= 0) {
> > sum += (u_int64_t) *lw++;
> > }
> > len += 4;
> > if (len > 0)
> > sum += (u_int64_t) (in_masks[len] & *lw);
> > REDUCE32;
> > return sum;
> > }
> > #endif
> >
> > #ifdef _64BYTES_WITH_PREFETCH_INSTRUCTION
> > static u_int64_t
> > in_cksumdata(const void *buf, int len)
> > {
> > const u_int32_t *lw = (const u_int32_t *) buf;
> > u_int64_t sum = 0;
> > int offset;
> > union q_util q_util;
> >
> > if ((3 & (long) lw) == 0 && len == 20) {
> > sum = (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3] + lw[4];
> > REDUCE32;
> > return sum;
> > }
> >
> > if ((offset = 3 & (long) lw) != 0) {
> > const u_int32_t *masks = in_masks + (offset << 2);
> > lw = (u_int32_t *) (((long) lw) - offset);
> > sum = *lw++ & masks[len >= 3 ? 3 : len];
> > len -= 4 - offset;
> > if (len <= 0) {
> > REDUCE32;
> > return sum;
> > }
> > }
> > #if 0
> > /*
> > * Force to cache line boundary.
> > */
> > offset = 32 - (0x1f & (long) lw);
> > if (offset < 32 && len > offset) {
> > len -= offset;
> > if (4 & offset) {
> > sum += (u_int64_t) lw[0];
> > lw += 1;
> > }
> > if (8 & offset) {
> > sum += (u_int64_t) lw[0] + lw[1];
> > lw += 2;
> > }
> > if (16 & offset) {
> > sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3];
> > lw += 4;
> > }
> > }
> > #endif
> > /*
> > * access prefilling to start load of next cache line.
> > * then add current cache line
> > * save result of prefilling for loop iteration.
> > */
> > __builtin_prefetch(&lw[0]);
> > while ((len -= 64) >= 4) {
> > __builtin_prefetch(&lw[16]);
> > sum += lw[0] + lw[1] + lw[2] + lw[3]
> > + lw[4] + lw[5] + lw[6] + lw[7]
> > + lw[8] + lw[9] + lw[10] + lw[11]
> > + lw[12] + lw[13] + lw[14] + lw[15];
> > lw += 16;
> > }
> > if (len >= 0) {
> > sum += lw[0] + lw[1] + lw[2] + lw[3]
> > + lw[4] + lw[5] + lw[6] + lw[7]
> > + lw[8] + lw[9] + lw[10] + lw[11]
> > + lw[12] + lw[13] + lw[14] + lw[15];
> > lw += 16;
> > } else {
> > len += 64;
> > }
> > while ((len -= 16) >= 0) {
> > sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3];
> > lw += 4;
> > }
> > len += 16;
> > while ((len -= 4) >= 0) {
> > sum += (u_int64_t) *lw++;
> > }
> > len += 4;
> > if (len > 0)
> > sum += (u_int64_t) (in_masks[len] & *lw);
> > REDUCE32;
> > return sum;
> > }
> > #endif
> >
> > /* ------------------------------------------------------------------------ */
> >
> > int main(void)
> > {
> > int i;
> > int sum;
> > struct timeval tv1, tv2, res;
> >
> > gettimeofday(&tv1, NULL);
> > for (i = 0; i < BUFFER_SIZE; i += PACKET_SIZE)
> > sum = in_cksumdata(&buffer[i], PACKET_SIZE);
> > gettimeofday(&tv2, NULL);
> >
> > timersub(&tv2, &tv1, &res);
> > printf("%ld.%6ld\n", res.tv_sec, res.tv_usec);
> >
> > return (0);
> > }
> >
More information about the freebsd-hackers
mailing list