Fwd: Questions with the in_cksumdata() function in sys/amd64/amd64/in_cksum.c
Tiwei Bie
btw at mail.ustc.edu.cn
Mon Oct 20 10:35:26 UTC 2014
> > > I would be not surprised if this manual prefetching by explicit reads
> > > causes slowdown of the function. I suspect it could confuse hardware
> > > prefetcher by breaking the linear pattern, or the patch could break
> > > the logic of the limited forward-looking oracle by reading too far
> > > from the current linear read tip.
> > >
> > > Also, it could confuse the data flow engine if the register allocator
> > > is unable to see that the read value is needed not right now, and cause
> > > unneeded stall while next cache line is fetched.
> > >
> > > Sure, all my speculations are pure garbage until confirmed by
> > > measurements with pmc, but I think that the patch below must be
> > > benchmarked to confirm any value it provides as well. My opinion is,
> > > we should either remove the manual prefetch, or do it with PREFETCHLX
> > > instructions only, instead of real read.
> >
> > I have done a rather simple test. And the results are listed as follows:
> >
> Yes, too simple to draw a conclusion, IMO.
>
> Please look at the ministat(1). I think that the test run length
> is too short to come with any decisions. The length x 3 runs does not
> give enough confidence; but ministat would provide the numbers to judge.
I have run ministat on these results, and its output is shown below.
It indicates that the differences are significant at the 99.5% confidence level.
$ ministat -w 80 -s -c 99.5 32BYTES_WITH_PRE_READ 32BYTES_WITHOUT_MANUAL_PREFETCH 64BYTES_WITH_PRE_READ 64BYTES_WITHOUT_MANUAL_PREFETCH 64BYTES_WITH_PREFETCH_INSTRUCTION
x 32BYTES_WITH_PRE_READ
+ 32BYTES_WITHOUT_MANUAL_PREFETCH
* 64BYTES_WITH_PRE_READ
% 64BYTES_WITHOUT_MANUAL_PREFETCH
# 64BYTES_WITH_PREFETCH_INSTRUCTION
+--------------------------------------------------------------------------------+
|% # |
|% *** # # ++ + xx x|
| |_MA__||
| |A_| |
| |A| |
|A |
| |MA_| |
+--------------------------------------------------------------------------------+
N Min Max Median Avg Stddev
x 3 0.768854 0.773803 0.770332 0.77099633 0.0025405028
+ 3 0.725956 0.728841 0.726651 0.72714933 0.0015056754
Difference at 99.5% confidence
-0.043847 +/- 0.0122301
-5.68706% +/- 1.58627%
(Student's t, pooled s = 0.00208821)
* 3 0.702416 0.704498 0.703648 0.70352067 0.0010468244
Difference at 99.5% confidence
-0.0674757 +/- 0.0113792
-8.75175% +/- 1.47591%
(Student's t, pooled s = 0.00194294)
% 3 0.697971 0.698314 0.698071 0.69811867 0.00017639822
Difference at 99.5% confidence
-0.0728777 +/- 0.0105464
-9.4524% +/- 1.36789%
(Student's t, pooled s = 0.00180073)
# 3 0.711962 0.715883 0.712118 0.713321 0.0022201277
Difference at 99.5% confidence
-0.0576753 +/- 0.0139724
-7.48062% +/- 1.81225%
(Student's t, pooled s = 0.0023857)
> > #1. Read 32 bytes with manual pre-read in each loop:
> >
> > $ cc main.c -D_32BYTES_WITH_PRE_READ
> > $ for i in `seq 3`; do ./a.out; done
> > 0.768854
> > 0.770332
> > 0.773803
> >
> > #2. Read 64 bytes with manual pre-read in each loop:
> >
> > $ cc main.c -D_64BYTES_WITH_PRE_READ
> > $ for i in `seq 3`; do ./a.out; done
> > 0.702416
> > 0.703648
> > 0.704498
> >
> > #3. Read 32 bytes without manual prefetch in each loop:
> >
> > $ cc main.c -D_32BYTES_WITHOUT_MANUAL_PREFETCH
> > $ for i in `seq 3`; do ./a.out; done
> > 0.726651
> > 0.728841
> > 0.725956
> >
> > #4. Read 64 bytes without manual prefetch in each loop:
> >
> > $ cc main.c -D_64BYTES_WITHOUT_MANUAL_PREFETCH
> > $ for i in `seq 3`; do ./a.out; done
> > 0.698071
> > 0.697971
> > 0.698314
> >
> > #5. Read 64 bytes with PREFETCH instruction:
> >
> > $ cc main.c -D_64BYTES_WITH_PREFETCH_INSTRUCTION
> > $ for i in `seq 3`; do ./a.out; done
> > 0.715883
> > 0.712118
> > 0.711962
> >
> > The test is very simple. I just ran the in_cksumdata() function on one
> > million packets. And the result is the time spent on calculating these
> > checksums.
> >
> > As we can see from the results, when reading 64 bytes of data without a manual
> > prefetch operation in each loop, the speed is the fastest. So, I think reading
> > a whole cache line in each loop is helpful.
> >
> > ---
> >
> > The computer that I run the test program on:
> >
> > $ dmesg | grep CPU:
> > CPU: Intel(R) Core(TM) i5-2400 CPU @ 3.10GHz (3093.03-MHz K8-class CPU)
> >
> > ---
> >
> > The test program:
> >
> > #include <stdio.h>
> > #include <sys/types.h>
> > #include <sys/time.h>
> >
> > /* ------------------------------------------------------------------------ */
> >
> > #define PACKET_SIZE 1500
> > #define BUFFER_SIZE ((PACKET_SIZE) << 20)
> > static unsigned char buffer[BUFFER_SIZE];
> >
> > /* ------------------------------------------------------------------------ */
> >
> > /*
> > * Checksum routine for Internet Protocol family headers
> > * (Portable Alpha version).
> > *
> > * This routine is very heavily used in the network
> > * code and should be modified for each CPU to be as fast as possible.
> > */
> >
> > #define ADDCARRY(x) (x > 65535 ? x -= 65535 : x)
> > #define REDUCE32 \
> > { \
> > q_util.q = sum; \
> > sum = q_util.s[0] + q_util.s[1] + q_util.s[2] + q_util.s[3]; \
> > }
> > #define REDUCE16 \
> > { \
> > q_util.q = sum; \
> > l_util.l = q_util.s[0] + q_util.s[1] + q_util.s[2] + q_util.s[3]; \
> > sum = l_util.s[0] + l_util.s[1]; \
> > ADDCARRY(sum); \
> > }
> >
> > static const u_int32_t in_masks[] = {
> > /*0 bytes*/ /*1 byte*/ /*2 bytes*/ /*3 bytes*/
> > 0x00000000, 0x000000FF, 0x0000FFFF, 0x00FFFFFF, /* offset 0 */
> > 0x00000000, 0x0000FF00, 0x00FFFF00, 0xFFFFFF00, /* offset 1 */
> > 0x00000000, 0x00FF0000, 0xFFFF0000, 0xFFFF0000, /* offset 2 */
> > 0x00000000, 0xFF000000, 0xFF000000, 0xFF000000, /* offset 3 */
> > };
> >
> > union l_util {
> > u_int16_t s[2];
> > u_int32_t l;
> > };
> > union q_util {
> > u_int16_t s[4];
> > u_int32_t l[2];
> > u_int64_t q;
> > };
> >
> > /* ------------------------------------------------------------------------ */
> >
> > //#define _32BYTES_WITH_PRE_READ
> > //#define _64BYTES_WITH_PRE_READ
> > //#define _32BYTES_WITHOUT_MANUAL_PREFETCH
> > //#define _64BYTES_WITHOUT_MANUAL_PREFETCH
> > //#define _64BYTES_WITH_PREFETCH_INSTRUCTION
> >
> > #ifdef _32BYTES_WITH_PRE_READ
> > static u_int64_t
> > in_cksumdata(const void *buf, int len)
> > {
> > const u_int32_t *lw = (const u_int32_t *) buf;
> > u_int64_t sum = 0;
> > u_int64_t prefilled;
> > int offset;
> > union q_util q_util;
> >
> > if ((3 & (long) lw) == 0 && len == 20) {
> > sum = (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3] + lw[4];
> > REDUCE32;
> > return sum;
> > }
> >
> > if ((offset = 3 & (long) lw) != 0) {
> > const u_int32_t *masks = in_masks + (offset << 2);
> > lw = (u_int32_t *) (((long) lw) - offset);
> > sum = *lw++ & masks[len >= 3 ? 3 : len];
> > len -= 4 - offset;
> > if (len <= 0) {
> > REDUCE32;
> > return sum;
> > }
> > }
> > #if 0
> > /*
> > * Force to cache line boundary.
> > */
> > offset = 32 - (0x1f & (long) lw);
> > if (offset < 32 && len > offset) {
> > len -= offset;
> > if (4 & offset) {
> > sum += (u_int64_t) lw[0];
> > lw += 1;
> > }
> > if (8 & offset) {
> > sum += (u_int64_t) lw[0] + lw[1];
> > lw += 2;
> > }
> > if (16 & offset) {
> > sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3];
> > lw += 4;
> > }
> > }
> > #endif
> > /*
> > * access prefilling to start load of next cache line.
> > * then add current cache line
> > * save result of prefilling for loop iteration.
> > */
> > prefilled = lw[0];
> > while ((len -= 32) >= 4) {
> > u_int64_t prefilling = lw[8];
> > sum += prefilled + lw[1] + lw[2] + lw[3]
> > + lw[4] + lw[5] + lw[6] + lw[7];
> > lw += 8;
> > prefilled = prefilling;
> > }
> > if (len >= 0) {
> > sum += prefilled + lw[1] + lw[2] + lw[3]
> > + lw[4] + lw[5] + lw[6] + lw[7];
> > lw += 8;
> > } else {
> > len += 32;
> > }
> > while ((len -= 16) >= 0) {
> > sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3];
> > lw += 4;
> > }
> > len += 16;
> > while ((len -= 4) >= 0) {
> > sum += (u_int64_t) *lw++;
> > }
> > len += 4;
> > if (len > 0)
> > sum += (u_int64_t) (in_masks[len] & *lw);
> > REDUCE32;
> > return sum;
> > }
> > #endif
> >
> > #ifdef _64BYTES_WITH_PRE_READ
> > static u_int64_t
> > in_cksumdata(const void *buf, int len)
> > {
> > const u_int32_t *lw = (const u_int32_t *) buf;
> > u_int64_t sum = 0;
> > u_int64_t prefilled;
> > int offset;
> > union q_util q_util;
> >
> > if ((3 & (long) lw) == 0 && len == 20) {
> > sum = (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3] + lw[4];
> > REDUCE32;
> > return sum;
> > }
> >
> > if ((offset = 3 & (long) lw) != 0) {
> > const u_int32_t *masks = in_masks + (offset << 2);
> > lw = (u_int32_t *) (((long) lw) - offset);
> > sum = *lw++ & masks[len >= 3 ? 3 : len];
> > len -= 4 - offset;
> > if (len <= 0) {
> > REDUCE32;
> > return sum;
> > }
> > }
> > #if 0
> > /*
> > * Force to cache line boundary.
> > */
> > offset = 32 - (0x1f & (long) lw);
> > if (offset < 32 && len > offset) {
> > len -= offset;
> > if (4 & offset) {
> > sum += (u_int64_t) lw[0];
> > lw += 1;
> > }
> > if (8 & offset) {
> > sum += (u_int64_t) lw[0] + lw[1];
> > lw += 2;
> > }
> > if (16 & offset) {
> > sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3];
> > lw += 4;
> > }
> > }
> > #endif
> > /*
> > * access prefilling to start load of next cache line.
> > * then add current cache line
> > * save result of prefilling for loop iteration.
> > */
> > prefilled = lw[0];
> > while ((len -= 64) >= 4) {
> > u_int64_t prefilling = lw[16];
> > sum += prefilled + lw[1] + lw[2] + lw[3]
> > + lw[4] + lw[5] + lw[6] + lw[7]
> > + lw[8] + lw[9] + lw[10] + lw[11]
> > + lw[12] + lw[13] + lw[14] + lw[15];
> > lw += 16;
> > prefilled = prefilling;
> > }
> > if (len >= 0) {
> > sum += prefilled + lw[1] + lw[2] + lw[3]
> > + lw[4] + lw[5] + lw[6] + lw[7]
> > + lw[8] + lw[9] + lw[10] + lw[11]
> > + lw[12] + lw[13] + lw[14] + lw[15];
> > lw += 16;
> > } else {
> > len += 64;
> > }
> > while ((len -= 16) >= 0) {
> > sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3];
> > lw += 4;
> > }
> > len += 16;
> > while ((len -= 4) >= 0) {
> > sum += (u_int64_t) *lw++;
> > }
> > len += 4;
> > if (len > 0)
> > sum += (u_int64_t) (in_masks[len] & *lw);
> > REDUCE32;
> > return sum;
> > }
> > #endif
> >
> > #ifdef _32BYTES_WITHOUT_MANUAL_PREFETCH
> > static u_int64_t
> > in_cksumdata(const void *buf, int len)
> > {
> > const u_int32_t *lw = (const u_int32_t *) buf;
> > u_int64_t sum = 0;
> > int offset;
> > union q_util q_util;
> >
> > if ((3 & (long) lw) == 0 && len == 20) {
> > sum = (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3] + lw[4];
> > REDUCE32;
> > return sum;
> > }
> >
> > if ((offset = 3 & (long) lw) != 0) {
> > const u_int32_t *masks = in_masks + (offset << 2);
> > lw = (u_int32_t *) (((long) lw) - offset);
> > sum = *lw++ & masks[len >= 3 ? 3 : len];
> > len -= 4 - offset;
> > if (len <= 0) {
> > REDUCE32;
> > return sum;
> > }
> > }
> > #if 0
> > /*
> > * Force to cache line boundary.
> > */
> > offset = 32 - (0x1f & (long) lw);
> > if (offset < 32 && len > offset) {
> > len -= offset;
> > if (4 & offset) {
> > sum += (u_int64_t) lw[0];
> > lw += 1;
> > }
> > if (8 & offset) {
> > sum += (u_int64_t) lw[0] + lw[1];
> > lw += 2;
> > }
> > if (16 & offset) {
> > sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3];
> > lw += 4;
> > }
> > }
> > #endif
> > /*
> > * access prefilling to start load of next cache line.
> > * then add current cache line
> > * save result of prefilling for loop iteration.
> > */
> > while ((len -= 32) >= 4) {
> > sum += lw[0] + lw[1] + lw[2] + lw[3]
> > + lw[4] + lw[5] + lw[6] + lw[7];
> > lw += 8;
> > }
> > if (len >= 0) {
> > sum += lw[0] + lw[1] + lw[2] + lw[3]
> > + lw[4] + lw[5] + lw[6] + lw[7];
> > lw += 8;
> > } else {
> > len += 32;
> > }
> > while ((len -= 16) >= 0) {
> > sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3];
> > lw += 4;
> > }
> > len += 16;
> > while ((len -= 4) >= 0) {
> > sum += (u_int64_t) *lw++;
> > }
> > len += 4;
> > if (len > 0)
> > sum += (u_int64_t) (in_masks[len] & *lw);
> > REDUCE32;
> > return sum;
> > }
> > #endif
> >
> > #ifdef _64BYTES_WITHOUT_MANUAL_PREFETCH
> > static u_int64_t
> > in_cksumdata(const void *buf, int len)
> > {
> > const u_int32_t *lw = (const u_int32_t *) buf;
> > u_int64_t sum = 0;
> > int offset;
> > union q_util q_util;
> >
> > if ((3 & (long) lw) == 0 && len == 20) {
> > sum = (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3] + lw[4];
> > REDUCE32;
> > return sum;
> > }
> >
> > if ((offset = 3 & (long) lw) != 0) {
> > const u_int32_t *masks = in_masks + (offset << 2);
> > lw = (u_int32_t *) (((long) lw) - offset);
> > sum = *lw++ & masks[len >= 3 ? 3 : len];
> > len -= 4 - offset;
> > if (len <= 0) {
> > REDUCE32;
> > return sum;
> > }
> > }
> > #if 0
> > /*
> > * Force to cache line boundary.
> > */
> > offset = 32 - (0x1f & (long) lw);
> > if (offset < 32 && len > offset) {
> > len -= offset;
> > if (4 & offset) {
> > sum += (u_int64_t) lw[0];
> > lw += 1;
> > }
> > if (8 & offset) {
> > sum += (u_int64_t) lw[0] + lw[1];
> > lw += 2;
> > }
> > if (16 & offset) {
> > sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3];
> > lw += 4;
> > }
> > }
> > #endif
> > /*
> > * access prefilling to start load of next cache line.
> > * then add current cache line
> > * save result of prefilling for loop iteration.
> > */
> > while ((len -= 64) >= 4) {
> > sum += lw[0] + lw[1] + lw[2] + lw[3]
> > + lw[4] + lw[5] + lw[6] + lw[7]
> > + lw[8] + lw[9] + lw[10] + lw[11]
> > + lw[12] + lw[13] + lw[14] + lw[15];
> > lw += 16;
> > }
> > if (len >= 0) {
> > sum += lw[0] + lw[1] + lw[2] + lw[3]
> > + lw[4] + lw[5] + lw[6] + lw[7]
> > + lw[8] + lw[9] + lw[10] + lw[11]
> > + lw[12] + lw[13] + lw[14] + lw[15];
> > lw += 16;
> > } else {
> > len += 64;
> > }
> > while ((len -= 16) >= 0) {
> > sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3];
> > lw += 4;
> > }
> > len += 16;
> > while ((len -= 4) >= 0) {
> > sum += (u_int64_t) *lw++;
> > }
> > len += 4;
> > if (len > 0)
> > sum += (u_int64_t) (in_masks[len] & *lw);
> > REDUCE32;
> > return sum;
> > }
> > #endif
> >
> > #ifdef _64BYTES_WITH_PREFETCH_INSTRUCTION
> > static u_int64_t
> > in_cksumdata(const void *buf, int len)
> > {
> > const u_int32_t *lw = (const u_int32_t *) buf;
> > u_int64_t sum = 0;
> > int offset;
> > union q_util q_util;
> >
> > if ((3 & (long) lw) == 0 && len == 20) {
> > sum = (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3] + lw[4];
> > REDUCE32;
> > return sum;
> > }
> >
> > if ((offset = 3 & (long) lw) != 0) {
> > const u_int32_t *masks = in_masks + (offset << 2);
> > lw = (u_int32_t *) (((long) lw) - offset);
> > sum = *lw++ & masks[len >= 3 ? 3 : len];
> > len -= 4 - offset;
> > if (len <= 0) {
> > REDUCE32;
> > return sum;
> > }
> > }
> > #if 0
> > /*
> > * Force to cache line boundary.
> > */
> > offset = 32 - (0x1f & (long) lw);
> > if (offset < 32 && len > offset) {
> > len -= offset;
> > if (4 & offset) {
> > sum += (u_int64_t) lw[0];
> > lw += 1;
> > }
> > if (8 & offset) {
> > sum += (u_int64_t) lw[0] + lw[1];
> > lw += 2;
> > }
> > if (16 & offset) {
> > sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3];
> > lw += 4;
> > }
> > }
> > #endif
> > /*
> > * access prefilling to start load of next cache line.
> > * then add current cache line
> > * save result of prefilling for loop iteration.
> > */
> > __builtin_prefetch(&lw[0]);
> > while ((len -= 64) >= 4) {
> > __builtin_prefetch(&lw[16]);
> > sum += lw[0] + lw[1] + lw[2] + lw[3]
> > + lw[4] + lw[5] + lw[6] + lw[7]
> > + lw[8] + lw[9] + lw[10] + lw[11]
> > + lw[12] + lw[13] + lw[14] + lw[15];
> > lw += 16;
> > }
> > if (len >= 0) {
> > sum += lw[0] + lw[1] + lw[2] + lw[3]
> > + lw[4] + lw[5] + lw[6] + lw[7]
> > + lw[8] + lw[9] + lw[10] + lw[11]
> > + lw[12] + lw[13] + lw[14] + lw[15];
> > lw += 16;
> > } else {
> > len += 64;
> > }
> > while ((len -= 16) >= 0) {
> > sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3];
> > lw += 4;
> > }
> > len += 16;
> > while ((len -= 4) >= 0) {
> > sum += (u_int64_t) *lw++;
> > }
> > len += 4;
> > if (len > 0)
> > sum += (u_int64_t) (in_masks[len] & *lw);
> > REDUCE32;
> > return sum;
> > }
> > #endif
> >
> > /* ------------------------------------------------------------------------ */
> >
> > int main(void)
> > {
> > int i;
> > int sum;
> > struct timeval tv1, tv2, res;
> >
> > gettimeofday(&tv1, NULL);
> > for (i = 0; i < BUFFER_SIZE; i += PACKET_SIZE)
> > sum = in_cksumdata(&buffer[i], PACKET_SIZE);
> > gettimeofday(&tv2, NULL);
> >
> > timersub(&tv2, &tv1, &res);
> > printf("%ld.%6ld\n", res.tv_sec, res.tv_usec);
> >
> > return (0);
> > }
> >
More information about the freebsd-hackers
mailing list