svn commit: r346175 - head/usr.bin/sort
Conrad Meyer
cem at FreeBSD.org
Tue Sep 3 14:07:36 UTC 2019
Author: cem
Date: Sat Apr 13 04:42:17 2019
New Revision: 346175
URL: https://svnweb.freebsd.org/changeset/base/346175
Log:
sort(1): Memoize MD5 computation to reduce repeated computation
Experimentally, reduces sort -R time of a 148160 line corpus from about
3.15s to about 0.93s on this particular system.
There's probably room for improvement using some digest other than md5, but
I don't want to look at sort(1) anymore. Some discussion of other possible
improvements in the Test Plan section of the Differential.
PR: 230792
Reviewed by: jhb (earlier version)
Differential Revision: https://reviews.freebsd.org/D19885
Modified:
head/usr.bin/sort/coll.c
head/usr.bin/sort/coll.h
head/usr.bin/sort/sort.c
Modified: head/usr.bin/sort/coll.c
==============================================================================
--- head/usr.bin/sort/coll.c Sat Apr 13 04:03:18 2019 (r346174)
+++ head/usr.bin/sort/coll.c Sat Apr 13 04:42:17 2019 (r346175)
@@ -981,6 +981,15 @@ hnumcoll(struct key_value *kv1, struct key_value *kv2,
return (numcoll_impl(kv1, kv2, offset, true));
}
+/* Use hint space to memoize md5 computations, at least. */
+static void
+randomcoll_init_hint(struct key_value *kv, void *hash)
+{
+
+ memcpy(kv->hint->v.Rh.cached, hash, sizeof(kv->hint->v.Rh.cached));
+ kv->hint->status = HS_INITIALIZED;
+}
+
/*
* Implements random sort (-R).
*/
@@ -991,6 +1000,7 @@ randomcoll(struct key_value *kv1, struct key_value *kv
struct bwstring *s1, *s2;
MD5_CTX ctx1, ctx2;
unsigned char hash1[MD5_DIGEST_LENGTH], hash2[MD5_DIGEST_LENGTH];
+ int cmp;
s1 = kv1->k;
s2 = kv2->k;
@@ -1003,6 +1013,14 @@ randomcoll(struct key_value *kv1, struct key_value *kv
if (s1 == s2)
return (0);
+ if (kv1->hint->status == HS_INITIALIZED &&
+ kv2->hint->status == HS_INITIALIZED) {
+ cmp = memcmp(kv1->hint->v.Rh.cached,
+ kv2->hint->v.Rh.cached, sizeof(kv1->hint->v.Rh.cached));
+ if (cmp != 0)
+ return (cmp);
+ }
+
memcpy(&ctx1, &md5_ctx, sizeof(MD5_CTX));
memcpy(&ctx2, &md5_ctx, sizeof(MD5_CTX));
@@ -1011,6 +1029,11 @@ randomcoll(struct key_value *kv1, struct key_value *kv
MD5Final(hash1, &ctx1);
MD5Final(hash2, &ctx2);
+
+ if (kv1->hint->status == HS_UNINITIALIZED)
+ randomcoll_init_hint(kv1, hash1);
+ if (kv2->hint->status == HS_UNINITIALIZED)
+ randomcoll_init_hint(kv2, hash2);
return (memcmp(hash1, hash2, sizeof(hash1)));
}
Modified: head/usr.bin/sort/coll.h
==============================================================================
--- head/usr.bin/sort/coll.h Sat Apr 13 04:03:18 2019 (r346174)
+++ head/usr.bin/sort/coll.h Sat Apr 13 04:42:17 2019 (r346175)
@@ -65,6 +65,17 @@ struct M_hint
};
/*
+ * Sort hint data for -R
+ *
+ * This stores the first 12 bytes of the digest rather than the full output to
+ * avoid increasing the size of the 'key_hint' object via the 'v' union.
+ */
+struct R_hint
+{
+ unsigned char cached[12];
+};
+
+/*
* Status of a sort hint object
*/
typedef enum
@@ -83,6 +94,7 @@ struct key_hint
struct n_hint nh;
struct g_hint gh;
struct M_hint Mh;
+ struct R_hint Rh;
} v;
};
Modified: head/usr.bin/sort/sort.c
==============================================================================
--- head/usr.bin/sort/sort.c Sat Apr 13 04:03:18 2019 (r346174)
+++ head/usr.bin/sort/sort.c Sat Apr 13 04:42:17 2019 (r346175)
@@ -583,6 +583,7 @@ set_sort_modifier(struct sort_mods *sm, int c)
break;
case 'R':
sm->Rflag = true;
+ need_hint = true;
need_random = true;
break;
case 'M':
More information about the svn-src-all
mailing list