PERFORCE change 28488 for review
Peter Wemm
peter at FreeBSD.org
Mon Apr 7 23:18:30 PDT 2003
http://perforce.freebsd.org/chv.cgi?CH=28488
Change 28488 by peter at peter_daintree on 2003/04/07 23:17:56
use the portable in_cksum from powerpc for now
Affected files ...
.. //depot/projects/hammer/sys/x86_64/include/in_cksum.h#2 edit
.. //depot/projects/hammer/sys/x86_64/x86_64/in_cksum.c#2 edit
Differences ...
==== //depot/projects/hammer/sys/x86_64/include/in_cksum.h#2 (text+ko) ====
@@ -33,16 +33,12 @@
* from tahoe: in_cksum.c 1.2 86/01/05
* from: @(#)in_cksum.c 1.3 (Berkeley) 1/19/91
* from: Id: in_cksum.c,v 1.8 1995/12/03 18:35:19 bde Exp
- * $FreeBSD: src/sys/i386/include/in_cksum.h,v 1.13 2002/06/22 22:35:53 jdp Exp $
+ * $FreeBSD: src/sys/powerpc/include/in_cksum.h,v 1.1 2002/06/29 09:49:24 benno Exp $
*/
#ifndef _MACHINE_IN_CKSUM_H_
#define _MACHINE_IN_CKSUM_H_ 1
-/*
- * MP safe (alfred)
- */
-
#include <sys/cdefs.h>
#define in_cksum(m, len) in_cksum_skip(m, len, 0)
@@ -54,33 +50,6 @@
* therefore always exactly five 32-bit words.
*/
#ifdef __GNUC__
-static __inline u_int
-in_cksum_hdr(const struct ip *ip)
-{
- register u_int sum = 0;
-
-/* __volatile is necessary here because the condition codes are used. */
-#define ADD(n) __asm __volatile ("addl %1, %0" : "+r" (sum) : \
- "g" (((const u_int32_t *)ip)[n / 4]))
-#define ADDC(n) __asm __volatile ("adcl %1, %0" : "+r" (sum) : \
- "g" (((const u_int32_t *)ip)[n / 4]))
-#define MOP __asm __volatile ("adcl $0, %0" : "+r" (sum))
-
- ADD(0);
- ADDC(4);
- ADDC(8);
- ADDC(12);
- ADDC(16);
- MOP;
-#undef ADD
-#undef ADDC
-#undef MOP
- sum = (sum & 0xffff) + (sum >> 16);
- if (sum > 0xffff)
- sum -= 0xffff;
-
- return ~sum & 0xffff;
-}
static __inline void
in_cksum_update(struct ip *ip)
@@ -90,32 +59,8 @@
ip->ip_sum = htons(__tmpsum + (__tmpsum >> 16));
}
-static __inline u_short
-in_addword(u_short sum, u_short b)
-{
- /* __volatile is necessary because the condition codes are used. */
- __asm __volatile ("addw %1, %0" : "+r" (sum) : "r" (b));
- __asm __volatile ("adcw $0, %0" : "+r" (sum));
+#else
- return (sum);
-}
-
-static __inline u_short
-in_pseudo(u_int sum, u_int b, u_int c)
-{
- /* __volatile is necessary because the condition codes are used. */
- __asm __volatile ("addl %1, %0" : "+r" (sum) : "g" (b));
- __asm __volatile ("adcl %1, %0" : "+r" (sum) : "g" (c));
- __asm __volatile ("adcl $0, %0" : "+r" (sum));
-
- sum = (sum & 0xffff) + (sum >> 16);
- if (sum > 0xffff)
- sum -= 0xffff;
- return (sum);
-}
-
-#else
-u_int in_cksum_hdr(const struct ip *);
#define in_cksum_update(ip) \
do { \
int __tmpsum; \
@@ -126,7 +71,10 @@
#endif
#ifdef _KERNEL
-u_short in_cksum_skip(struct mbuf *m, int len, int skip);
-#endif /* _KERNEL */
+u_int in_cksum_hdr(const struct ip *ip);
+u_short in_addword(u_short sum, u_short b);
+u_short in_pseudo(u_int sum, u_int b, u_int c);
+u_short in_cksum_skip(struct mbuf *m, int len, int skip);
+#endif
#endif /* _MACHINE_IN_CKSUM_H_ */
==== //depot/projects/hammer/sys/x86_64/x86_64/in_cksum.c#2 (text+ko) ====
@@ -1,6 +1,11 @@
-/*-
- * Copyright (c) 1990 The Regents of the University of California.
- * All rights reserved.
+/* $FreeBSD: src/sys/powerpc/powerpc/in_cksum.c,v 1.2 2003/02/13 08:56:41 grehan Exp $ */
+/* $NetBSD: in_cksum.c,v 1.7 1997/09/02 13:18:15 thorpej Exp $ */
+
+/*
+ * Copyright (c) 1988, 1992, 1993
+ * The Regents of the University of California. All rights reserved.
+ * Copyright (c) 1996
+ * Matt Thomas <matt at 3am-software.com>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -30,241 +35,216 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * from tahoe: in_cksum.c 1.2 86/01/05
- * from: @(#)in_cksum.c 1.3 (Berkeley) 1/19/91
- * $FreeBSD: src/sys/i386/i386/in_cksum.c,v 1.24 2002/06/22 22:35:49 jdp Exp $
+ * @(#)in_cksum.c 8.1 (Berkeley) 6/10/93
*/
-/*
- * MPsafe: alfred
- */
+#include <sys/cdefs.h> /* RCS ID & Copyright macro defns */
+
#include <sys/param.h>
+#include <sys/mbuf.h>
#include <sys/systm.h>
-#include <sys/mbuf.h>
-
+#include <netinet/in_systm.h>
#include <netinet/in.h>
-#include <netinet/in_systm.h>
#include <netinet/ip.h>
-
#include <machine/in_cksum.h>
/*
- * Checksum routine for Internet Protocol family headers.
+ * Checksum routine for Internet Protocol family headers
+ * (Portable Alpha version).
*
* This routine is very heavily used in the network
* code and should be modified for each CPU to be as fast as possible.
- *
- * This implementation is 386 version.
*/
-#undef ADDCARRY
-#define ADDCARRY(x) if ((x) > 0xffff) (x) -= 0xffff
-#define REDUCE {sum = (sum & 0xffff) + (sum >> 16); ADDCARRY(sum);}
+#define ADDCARRY(x) (x > 65535 ? x -= 65535 : x)
+#define REDUCE32 \
+ { \
+ q_util.q = sum; \
+ sum = q_util.s[0] + q_util.s[1] + q_util.s[2] + q_util.s[3]; \
+ }
+#define REDUCE16 \
+ { \
+ q_util.q = sum; \
+ l_util.l = q_util.s[0] + q_util.s[1] + q_util.s[2] + q_util.s[3]; \
+ sum = l_util.s[0] + l_util.s[1]; \
+ ADDCARRY(sum); \
+ }
+
+static const u_int32_t in_masks[] = {
+#if 0
+ /*0 bytes*/ /*1 byte*/ /*2 bytes*/ /*3 bytes*/
+ 0x00000000, 0x000000FF, 0x0000FFFF, 0x00FFFFFF, /* offset 0 */
+ 0x00000000, 0x0000FF00, 0x00FFFF00, 0xFFFFFF00, /* offset 1 */
+ 0x00000000, 0x00FF0000, 0xFFFF0000, 0xFFFF0000, /* offset 2 */
+ 0x00000000, 0xFF000000, 0xFF000000, 0xFF000000, /* offset 3 */
+#else
+ /*0 bytes*/ /*1 byte*/ /*2 bytes*/ /*3 bytes*/
+ 0x00000000, 0xFF000000, 0xFFFF0000, 0xFFFFFF00, /* offset 0 */
+ 0x00000000, 0x00FF0000, 0x00FFFF00, 0x00FFFFFF, /* offset 1 */
+ 0x00000000, 0x0000FF00, 0x0000FFFF, 0x0000FFFF, /* offset 2 */
+ 0x00000000, 0x000000FF, 0x000000FF, 0x000000FF, /* offset 3 */
+#endif
+};
-/*
- * These asm statements require __volatile because they pass information
- * via the condition codes. GCC does not currently provide a way to specify
- * the condition codes as an input or output operand.
- *
- * The LOAD macro below is effectively a prefetch into cache. GCC will
- * load the value into a register but will not use it. Since modern CPUs
- * reorder operations, this will generally take place in parallel with
- * other calculations.
- */
-#define ADD(n) __asm __volatile \
- ("addl %1, %0" : "+r" (sum) : \
- "g" (((const u_int32_t *)w)[n / 4]))
-#define ADDC(n) __asm __volatile \
- ("adcl %1, %0" : "+r" (sum) : \
- "g" (((const u_int32_t *)w)[n / 4]))
-#define LOAD(n) __asm __volatile \
- ("" : : "r" (((const u_int32_t *)w)[n / 4]))
-#define MOP __asm __volatile \
- ("adcl $0, %0" : "+r" (sum))
+union l_util {
+ u_int16_t s[2];
+ u_int32_t l;
+};
+union q_util {
+ u_int16_t s[4];
+ u_int32_t l[2];
+ u_int64_t q;
+};
-u_short
-in_cksum_skip(m, len, skip)
- struct mbuf *m;
- int len;
- int skip;
+static u_int64_t
+in_cksumdata(const void *buf, int len)
{
- register u_short *w;
- register unsigned sum = 0;
- register int mlen = 0;
- int byte_swapped = 0;
- union { char c[2]; u_short s; } su;
+ const u_int32_t *lw = (const u_int32_t *) buf;
+ u_int64_t sum = 0;
+ u_int64_t prefilled;
+ int offset;
+ union q_util q_util;
+
+ if ((3 & (long) lw) == 0 && len == 20) {
+ sum = (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3] + lw[4];
+ REDUCE32;
+ return sum;
+ }
- len -= skip;
- for (; skip && m; m = m->m_next) {
- if (m->m_len > skip) {
- mlen = m->m_len - skip;
- w = (u_short *)(mtod(m, u_char *) + skip);
- goto skip_start;
- } else {
- skip -= m->m_len;
+ if ((offset = 3 & (long) lw) != 0) {
+ const u_int32_t *masks = in_masks + (offset << 2);
+ lw = (u_int32_t *) (((long) lw) - offset);
+ sum = *lw++ & masks[len >= 3 ? 3 : len];
+ len -= 4 - offset;
+ if (len <= 0) {
+ REDUCE32;
+ return sum;
+ }
+ }
+#if 0
+ /*
+ * Force to cache line boundary.
+ */
+ offset = 32 - (0x1f & (long) lw);
+ if (offset < 32 && len > offset) {
+ len -= offset;
+ if (4 & offset) {
+ sum += (u_int64_t) lw[0];
+ lw += 1;
+ }
+ if (8 & offset) {
+ sum += (u_int64_t) lw[0] + lw[1];
+ lw += 2;
+ }
+ if (16 & offset) {
+ sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3];
+ lw += 4;
}
}
+#endif
+ /*
+ * access prefilling to start load of next cache line.
+ * then add current cache line
+ * save result of prefilling for loop iteration.
+ */
+ prefilled = lw[0];
+ while ((len -= 32) >= 4) {
+ u_int64_t prefilling = lw[8];
+ sum += prefilled + lw[1] + lw[2] + lw[3]
+ + lw[4] + lw[5] + lw[6] + lw[7];
+ lw += 8;
+ prefilled = prefilling;
+ }
+ if (len >= 0) {
+ sum += prefilled + lw[1] + lw[2] + lw[3]
+ + lw[4] + lw[5] + lw[6] + lw[7];
+ lw += 8;
+ } else {
+ len += 32;
+ }
+ while ((len -= 16) >= 0) {
+ sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3];
+ lw += 4;
+ }
+ len += 16;
+ while ((len -= 4) >= 0) {
+ sum += (u_int64_t) *lw++;
+ }
+ len += 4;
+ if (len > 0)
+ sum += (u_int64_t) (in_masks[len] & *lw);
+ REDUCE32;
+ return sum;
+}
+
+u_short
+in_addword(u_short a, u_short b)
+{
+ u_int64_t sum = a + b;
+
+ ADDCARRY(sum);
+ return (sum);
+}
+
+u_short
+in_pseudo(u_int32_t a, u_int32_t b, u_int32_t c)
+{
+ u_int64_t sum;
+ union q_util q_util;
+ union l_util l_util;
+
+ sum = (u_int64_t) a + b + c;
+ REDUCE16;
+ return (sum);
+}
+
+u_short
+in_cksum_skip(struct mbuf *m, int len, int skip)
+{
+ u_int64_t sum = 0;
+ int mlen = 0;
+ int clen = 0;
+ caddr_t addr;
+ union q_util q_util;
+ union l_util l_util;
- for (;m && len; m = m->m_next) {
+ len -= skip;
+ for (; skip && m; m = m->m_next) {
+ if (m->m_len > skip) {
+ mlen = m->m_len - skip;
+ addr = mtod(m, caddr_t) + skip;
+ goto skip_start;
+ } else {
+ skip -= m->m_len;
+ }
+ }
+
+ for (; m && len; m = m->m_next) {
if (m->m_len == 0)
continue;
- w = mtod(m, u_short *);
- if (mlen == -1) {
- /*
- * The first byte of this mbuf is the continuation
- * of a word spanning between this mbuf and the
- * last mbuf.
- */
-
- /* su.c[0] is already saved when scanning previous
- * mbuf. sum was REDUCEd when we found mlen == -1
- */
- su.c[1] = *(u_char *)w;
- sum += su.s;
- w = (u_short *)((char *)w + 1);
- mlen = m->m_len - 1;
- len--;
- } else
- mlen = m->m_len;
+ mlen = m->m_len;
+ addr = mtod(m, caddr_t);
skip_start:
if (len < mlen)
mlen = len;
+
+ if ((clen ^ (long) addr) & 1)
+ sum += in_cksumdata(addr, mlen) << 8;
+ else
+ sum += in_cksumdata(addr, mlen);
+
+ clen += mlen;
len -= mlen;
- /*
- * Force to long boundary so we do longword aligned
- * memory operations
- */
- if (3 & (int) w) {
- REDUCE;
- if ((1 & (int) w) && (mlen > 0)) {
- sum <<= 8;
- su.c[0] = *(char *)w;
- w = (u_short *)((char *)w + 1);
- mlen--;
- byte_swapped = 1;
- }
- if ((2 & (int) w) && (mlen >= 2)) {
- sum += *w++;
- mlen -= 2;
- }
- }
- /*
- * Advance to a 486 cache line boundary.
- */
- if (4 & (int) w && mlen >= 4) {
- ADD(0);
- MOP;
- w += 2;
- mlen -= 4;
- }
- if (8 & (int) w && mlen >= 8) {
- ADD(0);
- ADDC(4);
- MOP;
- w += 4;
- mlen -= 8;
- }
- /*
- * Do as much of the checksum as possible 32 bits at at time.
- * In fact, this loop is unrolled to make overhead from
- * branches &c small.
- */
- mlen -= 1;
- while ((mlen -= 32) >= 0) {
- /*
- * Add with carry 16 words and fold in the last
- * carry by adding a 0 with carry.
- *
- * The early ADD(16) and the LOAD(32) are to load
- * the next 2 cache lines in advance on 486's. The
- * 486 has a penalty of 2 clock cycles for loading
- * a cache line, plus whatever time the external
- * memory takes to load the first word(s) addressed.
- * These penalties are unavoidable. Subsequent
- * accesses to a cache line being loaded (and to
- * other external memory?) are delayed until the
- * whole load finishes. These penalties are mostly
- * avoided by not accessing external memory for
- * 8 cycles after the ADD(16) and 12 cycles after
- * the LOAD(32). The loop terminates when mlen
- * is initially 33 (not 32) to guaranteed that
- * the LOAD(32) is within bounds.
- */
- ADD(16);
- ADDC(0);
- ADDC(4);
- ADDC(8);
- ADDC(12);
- LOAD(32);
- ADDC(20);
- ADDC(24);
- ADDC(28);
- MOP;
- w += 16;
- }
- mlen += 32 + 1;
- if (mlen >= 32) {
- ADD(16);
- ADDC(0);
- ADDC(4);
- ADDC(8);
- ADDC(12);
- ADDC(20);
- ADDC(24);
- ADDC(28);
- MOP;
- w += 16;
- mlen -= 32;
- }
- if (mlen >= 16) {
- ADD(0);
- ADDC(4);
- ADDC(8);
- ADDC(12);
- MOP;
- w += 8;
- mlen -= 16;
- }
- if (mlen >= 8) {
- ADD(0);
- ADDC(4);
- MOP;
- w += 4;
- mlen -= 8;
- }
- if (mlen == 0 && byte_swapped == 0)
- continue; /* worth 1% maybe ?? */
- REDUCE;
- while ((mlen -= 2) >= 0) {
- sum += *w++;
- }
- if (byte_swapped) {
- sum <<= 8;
- byte_swapped = 0;
- if (mlen == -1) {
- su.c[1] = *(char *)w;
- sum += su.s;
- mlen = 0;
- } else
- mlen = -1;
- } else if (mlen == -1)
- /*
- * This mbuf has odd number of bytes.
- * There could be a word split betwen
- * this mbuf and the next mbuf.
- * Save the last byte (to prepend to next mbuf).
- */
- su.c[0] = *(char *)w;
}
+ REDUCE16;
+ return (~sum & 0xffff);
+}
- if (len)
- printf("%s: out of data by %d\n", __func__, len);
- if (mlen == -1) {
- /* The last mbuf has odd # of bytes. Follow the
- standard (the odd byte is shifted left by 8 bits) */
- su.c[1] = 0;
- sum += su.s;
- }
- REDUCE;
- return (~sum & 0xffff);
+u_int in_cksum_hdr(const struct ip *ip)
+{
+ u_int64_t sum = in_cksumdata(ip, sizeof(struct ip));
+ union q_util q_util;
+ union l_util l_util;
+ REDUCE16;
+ return (~sum & 0xffff);
}
More information about the p4-projects
mailing list