git: 4a0fc138e5eb - main - tools: add arc4random_uniform() bias test

From: Robert Clausecker <fuz_at_FreeBSD.org>
Date: Mon, 02 Dec 2024 10:51:07 UTC
The branch main has been updated by fuz:

URL: https://cgit.FreeBSD.org/src/commit/?id=4a0fc138e5eb343e45388e66698a4765b308a622

commit 4a0fc138e5eb343e45388e66698a4765b308a622
Author:     Robert Clausecker <fuz@FreeBSD.org>
AuthorDate: 2024-11-27 17:37:58 +0000
Commit:     Robert Clausecker <fuz@FreeBSD.org>
CommitDate: 2024-12-02 10:41:11 +0000

    tools: add arc4random_uniform() bias test
    
    This test program executed arc4random_uniform() repeatedly and
    analyzes the distribution of return values, showing how similar
    the parameters of the observed outcome are to the expected
    parameters of an equidistribution.
    
    This cannot be a unit test as it takes quite a while to run and
    lots of memory (~3 GB) to execute.
    
    Reviewed by:    cem
    Approved by:    emaste
    Differential Revision:  https://reviews.freebsd.org/D47659
---
 tools/test/README                |   2 +-
 tools/test/arc4random/biastest.c | 216 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 217 insertions(+), 1 deletion(-)

diff --git a/tools/test/README b/tools/test/README
index a54964621cd8..894aa051c6c1 100644
--- a/tools/test/README
+++ b/tools/test/README
@@ -1,4 +1,3 @@
-
 This directory is for standalone test programs.  For the FreeBSD
 Test Suite, which uses Kyua, please see /usr/src/tests/
 
@@ -7,6 +6,7 @@ and either tries to break it or measures its performance.
 
 Please make a subdir per program, and add a brief description to this file.
 
+arc4random	Bias test for arc4random_uniform()
 auxinfo		Return information on page sizes, CPUs, and OS release date.
 devrandom	Programs to test /dev/*random.
 gpioevents	Test delivery of gpio pin-change events to userland.
diff --git a/tools/test/arc4random/biastest.c b/tools/test/arc4random/biastest.c
new file mode 100644
index 000000000000..2fd4363b7869
--- /dev/null
+++ b/tools/test/arc4random/biastest.c
@@ -0,0 +1,216 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2024 Robert Clausecker <fuz@FreeBSD.org>
+ *
+ * biastest.c -- bias test for arc4random_uniform().
+ *
+ * The default configuration of this test has an upper bound of
+ * (3/4) * UINT32_MAX, which should give a high amount of bias in
+ * an incorrect implementation.  If the range reduction is
+ * implemented correctly, the parameters of the statistic should
+ * closely match the expected values.  If not, they'll differ.
+ *
+ * For memory usage reasons, we use an uchar to track the number of
+ * observations per bucket.  If the number of tries is much larger
+ * than upper_bound, the buckets likely overflow.  This is detected
+ * by the test, but will lead to incorrect results.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <signal.h>
+#include <stdatomic.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+static void	collect_sample(unsigned char *, long long, uint32_t);
+static void	analyze_sample(const unsigned char *, long long, uint32_t);
+
+static atomic_bool complete = false;
+static long long tries = 5ULL << 32;
+static atomic_llong tries_done = 0;
+
+static void
+usage(const char *argv0)
+{
+	fprintf(stderr, "usage: %s [-n tries] [-t upper_bound]\n", argv0);
+	exit(EXIT_FAILURE);
+}
+
+int
+main(int argc, char *argv[])
+{
+	uint32_t threshold = 3UL << 30;
+	int ch;
+	unsigned char *sample;
+
+	while (ch = getopt(argc, argv, "n:t:"), ch != EOF)
+		switch (ch) {
+		case 'n':
+			tries = atoll(optarg);
+			break;
+
+		case 't':
+			threshold = (uint32_t)atoll(optarg);
+			break;
+
+		default:
+			usage(argv[0]);
+		}
+
+	if (optind != argc)
+		usage(argv[0]);
+
+	if (threshold == 0) {
+		fprintf(stderr, "threshold must be between 1 and %lu\n", (unsigned long)UINT32_MAX);
+		exit(EXIT_FAILURE);
+	}
+
+	sample = calloc(threshold, 1);
+	if (sample == NULL) {
+		perror("calloc(threshold, 1)");
+		return (EXIT_FAILURE);
+	}
+
+	collect_sample(sample, tries, threshold);
+	analyze_sample(sample, tries, threshold);
+}
+
+static void
+progress(int signo)
+{
+	(void)signo;
+
+	if (!complete) {
+		fprintf(stderr, "\r%10lld of %10lld samples taken (%5.2f%% done)",
+		    tries_done, tries, (tries_done * 100.0) / tries);
+
+		signal(SIGALRM, progress);
+		alarm(1);
+	}
+}
+
+static void
+collect_sample(unsigned char *sample, long long tries, uint32_t threshold)
+{
+	long long i;
+	uint32_t x;
+	bool overflowed = false;
+
+	progress(SIGALRM);
+
+	for (i = 0; i < tries; i++) {
+		x = arc4random_uniform(threshold);
+		tries_done++;
+		assert(x < threshold);
+
+		if (sample[x] == UCHAR_MAX) {
+			if (!overflowed) {
+				printf("sample table overflow, results will be incorrect\n");
+				overflowed = true;
+			}
+		} else
+			sample[x]++;
+	}
+
+	progress(SIGALRM);
+	complete = true;
+	fputc('\n', stderr);
+}
+
+static void
+analyze_sample(const unsigned char *sample, long long tries,  uint32_t threshold)
+{
+	double discrepancy, average, variance, total;
+	long long histogram[UCHAR_MAX + 1] = { 0 }, sum, n, median;
+	uint32_t i, i_min, i_max;
+	int min, max;
+
+	printf("distribution properties:\n");
+
+	/* find median, average, deviation, smallest, and largest bucket */
+	total = 0.0;
+	for (i = 0; i < threshold; i++) {
+		histogram[sample[i]]++;
+		total += (double)i * sample[i];
+	}
+
+	average = total / tries;
+
+	variance = 0.0;
+	median = threshold;
+	n = 0;
+	i_min = 0;
+	i_max = 0;
+	min = sample[i_min];
+	max = sample[i_max];
+
+	for (i = 0; i < threshold; i++) {
+		discrepancy = i - average;
+		variance += sample[i] * discrepancy * discrepancy;
+
+		n += sample[i];
+		if (median == threshold && n > tries / 2)
+			median = i;
+
+		if (sample[i] < min) {
+			i_min = i;
+			min = sample[i_min];
+		} else if (sample[i] > max) {
+			i_max = i;
+			max = sample[i_max];
+		}
+	}
+
+	variance /= tries;
+	assert(median < threshold);
+
+	printf("\tthreshold:	%lu\n", (unsigned long)threshold);
+	printf("\tobservations:	%lld\n", tries);
+	printf("\tleast common:	%lu (%d observations)\n", (unsigned long)i_min, min);
+	printf("\tmost common:	%lu (%d observations)\n", (unsigned long)i_max, max);
+	printf("\tmedian:		%lld (expected %lu)\n", median, (unsigned long)threshold / 2);
+	printf("\taverage:	%f (expected %f)\n", average, 0.5 * (threshold - 1));
+	printf("\tdeviation:	%f (expected %f)\n\n", sqrt(variance),
+	    sqrt(((double)threshold * threshold - 1.0) / 12));
+
+	/* build histogram and analyze it */
+	printf("sample properties:\n");
+
+	/* find median, average, and deviation */
+	average = (double)tries / threshold;
+
+	variance = 0.0;
+	for (i = 0; i < UCHAR_MAX; i++) {
+		discrepancy = i - average;
+		variance += histogram[i] * discrepancy * discrepancy;
+	}
+
+	variance /= threshold;
+
+	n = 0;
+	median = UCHAR_MAX + 1;
+	for (i = 0; i <= UCHAR_MAX; i++) {
+		n += histogram[i];
+		if (n >= threshold / 2) {
+			median = i;
+			break;
+		}
+	}
+
+	assert(median <= UCHAR_MAX); /* unreachable */
+
+	printf("\tmedian:		%lld\n", median);
+	printf("\taverage:	%f\n", average);
+	printf("\tdeviation:	%f (expected %f)\n\n", sqrt(variance), sqrt(average * (1.0 - 1.0 / threshold)));
+
+	printf("histogram:\n");
+	for (i = 0; i < 256; i++)
+		if (histogram[i] != 0)
+			printf("\t%3d:\t%lld\n", (int)i, histogram[i]);
+}