git: 31ced5c14337 - stable/13 - split(1): auto-extend suffix length if required
- Go to: [ bottom of page ] [ top of archives ] [ this month ]
Date: Thu, 14 Sep 2023 15:00:32 UTC
The branch stable/13 has been updated by des: URL: https://cgit.FreeBSD.org/src/commit/?id=31ced5c14337a68a2e6bfde06d6f4cd9a465f218 commit 31ced5c14337a68a2e6bfde06d6f4cd9a465f218 Author: Jan Schaumann <jschauma@netmeister.org> AuthorDate: 2023-05-30 12:55:38 +0000 Commit: Dag-Erling Smørgrav <des@FreeBSD.org> CommitDate: 2023-09-14 14:59:52 +0000 split(1): auto-extend suffix length if required If the input cannot be split into the number of files resulting from the default suffix length, automatically extend the suffix length rather than bailing out with 'too many files'. Suffixes are extended such that the resulting files continue to sort lexically and "cat *" would reproduce the input. For example, splitting a 1M lines file into (default) 1000 lines per file would yield files named 'xaa', 'xab', ..., 'xyy', 'xyz', 'xzaaa', 'xzaab', ..., 'xzanl'. If '-a' is specified, the suffix length is not auto-extended. This behavior matches GNU split(1) since around version 8.16. Reviewed by: christos Approved by: kevans Differential Revision: https://reviews.freebsd.org/D38279 (cherry picked from commit c4f7198f47c15eece849d06e8fdd1fb46ed43bba) split(1): add '-c' to continue creating files Currently, split(1) will clobber any existing output files: $ split file; ls xaa xab xac xad $ split second-file; ls xaa xab xac xad xae xaf This patch adds a flag "-c" (mnemonic "create, don't overwrite" or "continue where you left off"): $ split file; ls xaa xab xac xad $ split -c second-file; ls xaa xab xac xad xae xaf xag xah xai xaj Reviewed by: christos Approved by: kevans Differential Revision: https://reviews.freebsd.org/D38553 (cherry picked from commit ac17fc816e67a4e5e2e481b5001577a8d589f8b6) --- usr.bin/split/split.1 | 28 ++++++++++++++++++------ usr.bin/split/split.c | 59 ++++++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 73 insertions(+), 14 deletions(-) diff --git a/usr.bin/split/split.1 b/usr.bin/split/split.1 index 6dba7489a83d..67e3c0cd448b 100644 --- 
a/usr.bin/split/split.1 +++ b/usr.bin/split/split.1 @@ -27,7 +27,7 @@ .\" .\" @(#)split.1 8.3 (Berkeley) 4/16/94 .\" -.Dd April 18, 2023 +.Dd May 26, 2023 .Dt SPLIT 1 .Os .Sh NAME @@ -35,12 +35,12 @@ .Nd split a file into pieces .Sh SYNOPSIS .Nm -.Op Fl d +.Op Fl cd .Op Fl l Ar line_count .Op Fl a Ar suffix_length .Op Ar file Op Ar prefix .Nm -.Op Fl d +.Op Fl cd .Fl b Ar byte_count Ns .Oo .Sm off @@ -50,12 +50,12 @@ .Op Fl a Ar suffix_length .Op Ar file Op Ar prefix .Nm -.Op Fl d +.Op Fl cd .Fl n Ar chunk_count .Op Fl a Ar suffix_length .Op Ar file Op Ar prefix .Nm -.Op Fl d +.Op Fl cd .Fl p Ar pattern .Op Fl a Ar suffix_length .Op Ar file Op Ar prefix @@ -111,6 +111,9 @@ or is appended to the number, the file is split into .Ar byte_count gigabyte pieces. +.It Fl c +Continue creating files and do not overwrite existing +output files. .It Fl d Use a numeric suffix instead of a alphabetic suffix. .It Fl l Ar line_count @@ -150,7 +153,11 @@ characters in the range .Dq Li a Ns - Ns Li z . If .Fl a -is not specified, two letters are used as the suffix. +is not specified, two letters are used as the initial suffix. +If the output does not fit into the resulting number of files and the +.Fl d +flag is not specified, then the suffix length is automatically extended as +needed such that all output files continue to sort in lexical order. .Pp If the .Ar prefix @@ -158,6 +165,15 @@ argument is not specified, the file is split into lexically ordered files named with the prefix .Dq Li x and with suffixes as above. +.Pp +By default, +.Nm +will overwrite any existing output files. +If the +.Fl c +flag is specified, +.Nm +will instead create files with names that do not already exist. 
.Sh ENVIRONMENT The .Ev LANG , LC_ALL , LC_CTYPE diff --git a/usr.bin/split/split.c b/usr.bin/split/split.c index 29ee0581d071..eeb7d663ecb1 100644 --- a/usr.bin/split/split.c +++ b/usr.bin/split/split.c @@ -65,6 +65,7 @@ static const char sccsid[] = "@(#)split.c 8.2 (Berkeley) 4/16/94"; static off_t bytecnt; /* Byte count to split on. */ static off_t chunks = 0; /* Chunks count to split into. */ +static bool clobber = true; /* Whether to overwrite existing output files. */ static long numlines; /* Line count to split on. */ static int file_open; /* If a file open. */ static int ifd = -1, ofd = -1; /* Input/output file descriptors. */ @@ -73,6 +74,7 @@ static regex_t rgx; static int pflag; static bool dflag; static long sufflen = 2; /* File name suffix length. */ +static int autosfx = 1; /* Whether to auto-extend the suffix length. */ static void newfile(void); static void split1(void); @@ -90,7 +92,7 @@ main(int argc, char **argv) setlocale(LC_ALL, ""); dflag = false; - while ((ch = getopt(argc, argv, "0123456789a:b:dl:n:p:")) != -1) + while ((ch = getopt(argc, argv, "0123456789a:b:cdl:n:p:")) != -1) switch (ch) { case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': @@ -114,6 +116,7 @@ main(int argc, char **argv) if ((sufflen = strtol(optarg, &ep, 10)) <= 0 || *ep) errx(EX_USAGE, "%s: illegal suffix length", optarg); + autosfx = 0; break; case 'b': /* Byte count. */ errno = 0; @@ -121,6 +124,9 @@ main(int argc, char **argv) if (error == -1) errx(EX_USAGE, "%s: offset too large", optarg); break; + case 'c': /* Continue, don't overwrite output files. 
*/ + clobber = false; + break; case 'd': /* Decimal suffix */ dflag = true; break; @@ -343,6 +349,10 @@ newfile(void) static char *fpnt; char beg, end; int pattlen; + int flags = O_WRONLY | O_CREAT | O_TRUNC; + + if (!clobber) + flags |= O_EXCL; if (ofd == -1) { if (fname[0] == '\0') { @@ -351,9 +361,10 @@ newfile(void) } else { fpnt = fname + strlen(fname); } - ofd = fileno(stdout); - } + } else if (close(ofd) != 0) + err(1, "%s", fname); + again: if (dflag) { beg = '0'; end = '9'; @@ -364,6 +375,35 @@ newfile(void) } pattlen = end - beg + 1; + /* + * If '-a' is not specified, then we automatically expand the + * suffix length to accomodate splitting all input. We do this + * by moving the suffix pointer (fpnt) forward and incrementing + * sufflen by one, thereby yielding an additional two characters + * and allowing all output files to sort such that 'cat *' yields + * the input in order. I.e., the order is '... xyy xyz xzaaa + * xzaab ... xzyzy, xzyzz, xzzaaaa, xzzaaab' and so on. + */ + if (!dflag && autosfx && (fpnt[0] == 'y') && + strspn(fpnt+1, "z") == strlen(fpnt+1)) { + fpnt = fname + strlen(fname) - sufflen; + fpnt[sufflen + 2] = '\0'; + fpnt[0] = end; + fpnt[1] = beg; + + /* Basename | Suffix + * before: + * x | yz + * after: + * xz | a.. */ + fpnt++; + sufflen++; + + /* Reset so we start back at all 'a's in our extended suffix. */ + tfnum = 0; + fnum = 0; + } + /* maxfiles = pattlen^sufflen, but don't use libm. 
*/ for (maxfiles = 1, i = 0; i < sufflen; i++) if (LONG_MAX / pattlen < maxfiles) @@ -384,8 +424,11 @@ newfile(void) fpnt[sufflen] = '\0'; ++fnum; - if (!freopen(fname, "w", stdout)) + if ((ofd = open(fname, flags, DEFFILEMODE)) < 0) { + if (!clobber && errno == EEXIST) + goto again; err(EX_IOERR, "%s", fname); + } file_open = 1; } @@ -393,9 +436,9 @@ static void usage(void) { (void)fprintf(stderr, -"usage: split [-d] [-l line_count] [-a suffix_length] [file [prefix]]\n" -" split [-d] -b byte_count[K|k|M|m|G|g] [-a suffix_length] [file [prefix]]\n" -" split [-d] -n chunk_count [-a suffix_length] [file [prefix]]\n" -" split [-d] -p pattern [-a suffix_length] [file [prefix]]\n"); +"usage: split [-cd] [-l line_count] [-a suffix_length] [file [prefix]]\n" +" split [-cd] -b byte_count[K|k|M|m|G|g] [-a suffix_length] [file [prefix]]\n" +" split [-cd] -n chunk_count [-a suffix_length] [file [prefix]]\n" +" split [-cd] -p pattern [-a suffix_length] [file [prefix]]\n"); exit(EX_USAGE); }