commit: 4bf578acbaab8c77b2c137188fd540d2e55ff573
parent bf534b0ea743b73e32e1d65eb87d67273318d23b
Author: Haelwenn (lanodan) Monnier <contact@hacktivis.me>
Date: Fri, 23 Aug 2024 22:33:31 +0200
cmd/cut: new
Diffstat:
7 files changed, 529 insertions(+), 3 deletions(-)
diff --git a/cmd/cut.1 b/cmd/cut.1
@@ -0,0 +1,72 @@
+.\" utils-std: Collection of commonly available Unix tools
+.\" Copyright 2017 Haelwenn (lanodan) Monnier <contact+utils@hacktivis.me>
+.\" SPDX-License-Identifier: MPL-2.0
+.Dd 2024-08-24
+.Dt CUT 1
+.Os
+.Sh NAME
+.Nm cut
+.Nd cut each line with selected characters/fields
+.Sh SYNOPSIS
+.Nm
+.Fl b Ar list
+.Op Fl n
+.Op Ar file...
+.Nm
+.Fl c Ar list
+.Op Ar file...
+.Nm
+.Fl f Ar list
+.Op Fl d Ar delim
+.Op Fl s
+.Op Ar file...
+.Sh DESCRIPTION
+.Nm
+reads lines from each
+.Ar file
+or if unspecified standard input,
+and cuts out bytes
+.Pq Fl b ,
+characters
+.Pq Fl c ,
+or character-delimited fields
+.Pq Fl f .
+.Pp
+The
+.Ar list
+argument is a comma-separated list of 1-based ranges, where for example
+.Ql 1,2,3
+and
+.Ql 1-3
+are equivalent.
+.Sh OPTIONS
+.Bl -tag -width _d_delim
+.It Fl b Ar list
+Cut bytes based on
+.Ar list .
+.It Fl c Ar list
+Cut codepoints based on
+.Ar list .
+.It Fl d Ar delim
+Set the field delimiter to the character
+.Ar delim .
+(default: \et)
+.It Fl f Ar list
+Cut fields based on
+.Ar list .
+.It Fl n
+Do not split codepoints. (Currently unsupported in this implementation)
+.It Fl s
+Suppress lines with no delimiter characters.
+.El
+.Sh EXIT STATUS
+.Ex -std
+.Sh STANDARDS
+Except for the lack of support for
+.Fl n ,
+.Nm
+should be compliant with the
+IEEE Std 1003.1-2024 (“POSIX.1”)
+specification.
+.Sh AUTHORS
+.An Haelwenn (lanodan) Monnier Aq Mt contact+utils@hacktivis.me
diff --git a/cmd/cut.c b/cmd/cut.c
@@ -0,0 +1,421 @@
+// utils-std: Collection of commonly available Unix tools
+// SPDX-FileCopyrightText: 2017 Haelwenn (lanodan) Monnier <contact+utils@hacktivis.me>
+// SPDX-License-Identifier: MPL-2.0
+
+#define _POSIX_C_SOURCE 202405L
+
+#include "../lib/reallocarray.h"
+
+#include <errno.h>
+#include <locale.h>
+#include <stdbool.h>
+#include <stdint.h> // size_t
+#include <stdio.h> // fprintf, fopen
+#include <stdlib.h> // abort, free, strtoul
+#include <string.h> // strerror
+#include <unistd.h> // getopt
+#include <wchar.h>
+
+// Smaller of two values. Both arguments are evaluated twice, so they must be
+// free of side effects. Any MIN inherited from a system header is dropped first.
+#undef MIN
+#define MIN(a, b) (((a) < (b)) ? (a) : (b))
+
+// Operation selected on the command line; exactly one of -b, -c, -f is expected.
+enum cut_mode
+{
+ CUT_MODE_NONE = 0, // no -b/-c/-f seen yet
+ CUT_MODE_B = 1, // -b: select bytes
+ CUT_MODE_C = 2, // -c: select characters
+ CUT_MODE_F = 3, // -f: select delimiter-separated fields
+};
+
+// Field delimiter used by -f, replaced by the argument of -d
+char delim = '\t';
+// Set by the -n and -s options respectively
+bool opt_n = false, opt_s = false;
+// Operation chosen via -b, -c or -f
+enum cut_mode mode = CUT_MODE_NONE;
+// Selection table filled by parse_list(): list[i] is true when the 0-based
+// position i (byte, character or field) is selected
+bool *list = NULL;
+// Number of entries in list
+size_t list_len = 0;
+
+// Parses one 1-based list number starting at *s and advances *s past it.
+//
+// The number must begin with a decimal digit and be followed by ',', '-' or the
+// end of the string.
+//
+// Returns the parsed value (always >= 1), or -1 after printing a diagnostic
+// on stderr; *s is left untouched on failure.
+static ssize_t
+parse_list_num(char **s)
+{
+	// strtoul() on its own skips leading blanks and accepts a sign, which would
+	// turn "-5" into a huge wrapped value, so insist on a digit up front.
+	if(**s < '0' || **s > '9')
+	{
+		if(**s == '\0')
+			fprintf(stderr, "cut: Error: missing number in list\n");
+		else
+			fprintf(stderr, "cut: Invalid character in list: %c\n", **s);
+		return -1;
+	}
+
+	char *endptr = NULL;
+	errno = 0;
+	unsigned long n = strtoul(*s, &endptr, 10);
+	if(errno != 0)
+	{
+		fprintf(stderr, "cut: Error while parsing '%s' as a number: %s\n", *s, strerror(errno));
+		return -1;
+	}
+
+	if(n < 1)
+	{
+		fprintf(stderr, "cut: Invalid number in list: %lu\n", n);
+		return -1;
+	}
+
+	// The value is handed back as ssize_t, where negatives are reserved for errors
+	if(n > SIZE_MAX / 2)
+	{
+		fprintf(stderr, "cut: Number too large in list: %lu\n", n);
+		return -1;
+	}
+
+	// strchr() also matches the terminating NUL, so end-of-string is accepted here
+	if(strchr(",-", *endptr) == NULL)
+	{
+		fprintf(stderr, "cut: Invalid character in list: %c\n", *endptr);
+		return -1;
+	}
+
+	*s = endptr;
+
+	return (ssize_t)n;
+}
+
+// Parses a comma-separated list of 1-based numbers and ranges ("2", "4-7")
+// into the global selection table `list`, growing it as needed.
+//
+// s may be NULL or empty, which is reported as an empty list.
+//
+// Returns 0 on success, -1 after printing a diagnostic on stderr.
+static int
+parse_list(char *s)
+{
+	while(s != NULL && *s != '\0')
+	{
+		if(*s == ',')
+		{
+			fprintf(stderr, "cut: Error: empty list element\n");
+			return -1;
+		}
+
+		ssize_t first = parse_list_num(&s);
+		if(first < 0) return -1;
+
+		// A lone number selects just itself
+		ssize_t last = first;
+
+		if(*s == '-')
+		{
+			s++;
+			last = parse_list_num(&s);
+			if(last < 0) return -1;
+
+			// Both bounds are still 1-based here, so they compare directly
+			if(last < first)
+			{
+				fprintf(stderr, "cut: Error: decreasing range: %zd-%zd\n", first, last);
+				return -1;
+			}
+		}
+
+		// Needs to be after *s == '-'
+		if(*s == ',') s++;
+
+		// cut(1) is 1-indexed: convert to a 0-based half-open interval [start, end)
+		size_t start = (size_t)first - 1;
+		size_t end = (size_t)last;
+
+		if(end > list_len)
+		{
+			// Keep the old pointer until the call is known to have succeeded
+			bool *grown = reallocarray(list, end, sizeof(*list));
+			if(grown == NULL)
+			{
+				fprintf(stderr, "cut: Memory allocation error: %s\n", strerror(errno));
+				return -1;
+			}
+			list = grown;
+
+			// The new tail is uninitialised: clear the gap before this range,
+			// the range itself is filled right below.
+			for(size_t i = list_len; i < start; i++)
+				list[i] = false;
+
+			list_len = end;
+		}
+
+		for(size_t i = start; i < end; i++)
+			list[i] = true;
+	}
+
+	if(list_len == 0)
+	{
+		fprintf(stderr, "cut: Error: empty list\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+// Byte mode (-b): for every line of `in`, writes the bytes whose 0-based
+// position is selected in `list`, followed by a newline.
+//
+// filename is only used in diagnostics.
+//
+// Returns 0 on success, 1 if reading from `in` failed.
+static int
+cut_b(FILE *in, char *filename)
+{
+	char *line = NULL;
+	size_t line_len = 0;
+	int err = 0;
+
+	while(err == 0)
+	{
+		errno = 0;
+		ssize_t nread = getline(&line, &line_len, in);
+
+		if(nread < 0)
+		{
+			// getline() returns -1 both at end-of-file and on error; errno tells them apart
+			if(errno != 0)
+			{
+				fprintf(stderr, "cut: Error while reading file '%s': %s\n", filename, strerror(errno));
+				err = 1;
+			}
+			break;
+		}
+
+		// The line terminator is not data: without this, a list reaching past
+		// the end of a short line would emit the newline twice.
+		if(nread > 0 && line[nread - 1] == '\n') nread--;
+
+		for(size_t i = 0; i < MIN(list_len, (size_t)nread); i++)
+			if(list[i]) fputc(line[i], stdout);
+
+		fputc('\n', stdout);
+	}
+
+	// getline() may allocate even when it fails; free(NULL) is a no-op
+	free(line);
+
+	return err;
+}
+
+// Character mode (-c): for every line of `in`, writes the characters (in the
+// current locale's multibyte encoding) whose 0-based position is selected in
+// `list`, followed by a newline.
+//
+// Characters are located with mbrlen() and copied as bytes, so stdout is only
+// ever used with byte-oriented functions. Only the part of the line covered
+// by `list` is decoded; a line may already be partially written when an
+// invalid sequence is found in it.
+//
+// filename is only used in diagnostics.
+//
+// Returns 0 on success, 1 on read error or invalid multibyte sequence.
+static int
+cut_c(FILE *in, char *filename)
+{
+	char *line = NULL;
+	size_t line_len = 0;
+	int err = 0;
+
+	while(err == 0)
+	{
+		errno = 0;
+		ssize_t nread = getline(&line, &line_len, in);
+
+		if(nread < 0)
+		{
+			// getline() returns -1 both at end-of-file and on error; errno tells them apart
+			if(errno != 0)
+			{
+				fprintf(stderr, "cut: Error while reading file '%s': %s\n", filename, strerror(errno));
+				err = 1;
+			}
+			break;
+		}
+
+		// The line terminator is not data: without this, a list reaching past
+		// the end of a short line would emit the newline twice.
+		if(nread > 0 && line[nread - 1] == '\n') nread--;
+
+		// Every line starts in the initial shift state
+		mbstate_t state;
+		memset(&state, 0, sizeof(state));
+
+		size_t pos = 0;
+		for(size_t i = 0; i < list_len && pos < (size_t)nread; i++)
+		{
+			size_t clen = mbrlen(line + pos, (size_t)nread - pos, &state);
+
+			if(clen == (size_t)-1 || clen == (size_t)-2)
+			{
+				// mbrlen() only sets errno for invalid sequences, not for truncated ones
+				if(clen == (size_t)-2) errno = EILSEQ;
+
+				fprintf(stderr,
+				        "cut: Error while parsing characters in file '%s': %s\n",
+				        filename,
+				        strerror(errno));
+				err = 1;
+				break;
+			}
+
+			// mbrlen() reports an embedded NUL as 0; it still occupies one byte
+			if(clen == 0) clen = 1;
+
+			if(list[i]) fwrite(line + pos, 1, clen, stdout);
+
+			pos += clen;
+		}
+
+		if(err != 0) break;
+
+		fputc('\n', stdout);
+	}
+
+	// getline() may allocate even when it fails; free(NULL) is a no-op
+	free(line);
+
+	return err;
+}
+
+// Field mode (-f): for every line of `in`, writes the `delim`-separated fields
+// whose 0-based index is selected in `list`, joined by `delim` and followed by
+// a newline.
+//
+// Every delimiter starts a new field, so empty fields are kept. Lines that
+// contain no delimiter at all are copied through unchanged, or dropped when
+// -s (opt_s) is set.
+//
+// filename is only used in diagnostics.
+//
+// Returns 0 on success, 1 if reading from `in` failed.
+static int
+cut_f(FILE *in, char *filename)
+{
+	char *line = NULL;
+	size_t line_len = 0;
+	int err = 0;
+
+	while(err == 0)
+	{
+		errno = 0;
+		ssize_t nread = getline(&line, &line_len, in);
+
+		if(nread < 0)
+		{
+			// getline() returns -1 both at end-of-file and on error; errno tells them apart
+			if(errno != 0)
+			{
+				fprintf(stderr, "cut: Error while reading file '%s': %s\n", filename, strerror(errno));
+				err = 1;
+			}
+			break;
+		}
+
+		if(nread > 0 && line[nread - 1] == '\n') line[--nread] = '\0';
+
+		char *end = line + nread;
+
+		if(memchr(line, delim, (size_t)nread) == NULL)
+		{
+			if(!opt_s)
+			{
+				fwrite(line, 1, (size_t)nread, stdout);
+				fputc('\n', stdout);
+			}
+			continue;
+		}
+
+		bool need_sep = false;
+		char *field = line;
+
+		// Fields past the end of the list can never be selected, so stop there
+		for(size_t i = 0; i < list_len; i++)
+		{
+			char *sep = memchr(field, delim, (size_t)(end - field));
+			size_t field_len = (sep != NULL) ? (size_t)(sep - field) : (size_t)(end - field);
+
+			if(list[i])
+			{
+				if(need_sep) fputc(delim, stdout);
+
+				fwrite(field, 1, field_len, stdout);
+				need_sep = true;
+			}
+
+			// No further delimiter: this was the last field of the line
+			if(sep == NULL) break;
+
+			field = sep + 1;
+		}
+
+		fputc('\n', stdout);
+	}
+
+	// getline() may allocate even when it fails; free(NULL) is a no-op
+	free(line);
+
+	return err;
+}
+
+// Runs the operation selected in `mode` on `in` and returns its status;
+// filename is forwarded for use in diagnostics.
+//
+// Returns 1 with a diagnostic when no operation was selected, and aborts on a
+// mode value outside of enum cut_mode.
+static int
+cut(FILE *in, char *filename)
+{
+	if(mode == CUT_MODE_B) return cut_b(in, filename);
+	if(mode == CUT_MODE_C) return cut_c(in, filename);
+	if(mode == CUT_MODE_F) return cut_f(in, filename);
+
+	if(mode == CUT_MODE_NONE)
+	{
+		fprintf(stderr, "cut: No action (-b, -c, -f) specified\n");
+		return 1;
+	}
+
+	abort();
+}
+
+// Entry point: parses the options, builds the selection list, then cuts every
+// file operand in order. Standard input is used when there is no operand and
+// for each operand that is exactly "-".
+//
+// Returns 0 on success, 1 on usage, I/O or parsing errors.
+int
+main(int argc, char *argv[])
+{
+	char *opt_list = NULL;
+
+	errno = 0;
+	setlocale(LC_ALL, "");
+	if(errno != 0)
+	{
+		fprintf(stderr, "cut: Warning: Failed to initialize locales: %s\n", strerror(errno));
+		errno = 0;
+	}
+
+	int c = -1;
+	while((c = getopt(argc, argv, ":b:c:d:f:ns")) != -1)
+	{
+		switch(c)
+		{
+		case 'b':
+		case 'c':
+		case 'f':
+			// The three list options are mutually exclusive and may not be repeated
+			if(opt_list != NULL)
+			{
+				fprintf(stderr, "cut: Only one list may be specified\n");
+				return 1;
+			}
+			mode = (c == 'b') ? CUT_MODE_B : (c == 'c') ? CUT_MODE_C : CUT_MODE_F;
+			opt_list = optarg;
+			break;
+		case 'd':
+			if(optarg[0] == '\0')
+			{
+				fprintf(stderr, "cut: Option '-d' requires a character, got an empty string\n");
+				return 1;
+			}
+			if(optarg[1] != '\0')
+			{
+				fprintf(stderr, "cut: Option '-d' only accepts single characters, got \"%s\"\n", optarg);
+				return 1;
+			}
+			delim = optarg[0];
+			break;
+		case 'n':
+			opt_n = true;
+			break;
+		case 's':
+			opt_s = true;
+			break;
+		case ':':
+			fprintf(stderr, "cut: Option '-%c' requires an operand\n", optopt);
+			return 1;
+		default:
+			fprintf(stderr, "cut: Unhandled option '-%c'\n", optopt);
+			return 1;
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	if(mode == CUT_MODE_NONE)
+	{
+		fprintf(stderr, "cut: No action (-b, -c, -f) specified\n");
+		return 1;
+	}
+
+	if(parse_list(opt_list) < 0) return 1;
+
+	int ret = 0;
+
+	if(argc <= 0) ret = cut(stdin, "<stdin>");
+
+	for(int i = 0; ret == 0 && i < argc; i++)
+	{
+		// POSIX: a "-" operand designates standard input, which must stay open
+		if(strcmp(argv[i], "-") == 0)
+		{
+			ret = cut(stdin, "<stdin>");
+			continue;
+		}
+
+		FILE *in = fopen(argv[i], "r");
+		if(in == NULL)
+		{
+			fprintf(stderr, "cut: Failed opening file '%s': %s\n", argv[i], strerror(errno));
+			return 1;
+		}
+
+		ret = cut(in, argv[i]);
+
+		if(fclose(in) != 0)
+		{
+			fprintf(stderr, "cut: Failed closing file '%s': %s\n", argv[i], strerror(errno));
+			return 1;
+		}
+	}
+
+	// Output is buffered: flush explicitly so that write errors affect the exit status
+	if(fflush(stdout) != 0)
+	{
+		fprintf(stderr, "cut: Failed writing to stdout: %s\n", strerror(errno));
+		return 1;
+	}
+
+	return (ret != 0) ? 1 : 0;
+}
diff --git a/coreutils.txt b/coreutils.txt
@@ -17,7 +17,7 @@ cksum: Done
comm: Maybe, POSIX
cp: Todo
csplit: No. Considered obsolete
-cut: Todo
+cut: Done
date: Done
dd: No. non-Unix
df: Done
diff --git a/lsb_commands.txt b/lsb_commands.txt
@@ -21,7 +21,7 @@ cp: Todo
cpio: out of scope
crontab: out of scope
csplit: No. Considered obsolete
-cut: Todo
+cut: Done
date: Done
dd: No. non-Unix
df: Done
diff --git a/posix_utilities.txt b/posix_utilities.txt
@@ -30,7 +30,7 @@ cp
crontab: no, external
csplit
ctags: no, toolchain XOPEN_UNIX
-cut
+cut: done
cxref: no, toolchain XOPEN_UNIX
date: done
dd
diff --git a/test-cmd/Kyuafile b/test-cmd/Kyuafile
@@ -17,6 +17,7 @@ tap_test_program{name="cat.sh", required_files=basedir.."/cmd/cat", timeout=1}
tap_test_program{name="chmod.sh", required_files=basedir.."/cmd/chmod", timeout=1}
tap_test_program{name="cksum.sh", required_files=basedir.."/cmd/cksum", timeout=1}
tap_test_program{name="cmp.sh", required_files=basedir.."/cmd/cmp", timeout=1}
+tap_test_program{name="cut.sh", required_files=basedir.."/cmd/cut", timeout=1}
tap_test_program{name="date.sh", required_files=basedir.."/cmd/date", timeout=1}
tap_test_program{name="dirname.sh", required_files=basedir.."/cmd/dirname", timeout=1}
tap_test_program{name="echo.sh", required_files=basedir.."/cmd/echo", timeout=1}
diff --git a/test-cmd/cut.sh b/test-cmd/cut.sh
@@ -0,0 +1,32 @@
+#!/bin/sh
+# SPDX-FileCopyrightText: 2017 Haelwenn (lanodan) Monnier <contact+utils@hacktivis.me>
+# SPDX-License-Identifier: MPL-2.0
+
+# Repository root, relative to this script
+WD="$(dirname "$0")/../"
+# Binary under test
+target="${WD}/cmd/cut"
+# Must match the number of `t` invocations below
+plans=6
+# Provides `t`; presumably: t [options] <name> <arguments> <expected output> -- see tap.sh
+. "${WD}/test-cmd/tap.sh"
+
+# Byte mode with unordered and overlapping ranges
+t 'bytes:2-3,10-20,4,12' "-b 2-3,10-20,4,12 ${WD}/test-cmd/inputs/alnum" '1239ABCDEFGHIJ
+'
+# A lone comma is an empty list element and must be rejected
+t --exit=1 'bytes:,' "-b , ${WD}/test-cmd/inputs/alnum" 'cut: Error: empty list element
+'
+# Example taken from POSIX cut(1)
+t --input='abcdefghi' 'chars:6,2,4-7,1' '-c 6,2,4-7,1' 'abdefg
+'
+
+# Character mode must select a whole multibyte character, not a single byte
+t --input='aéb' 'widechar' '-c2' 'é
+'
+
+# Input for the field tests; the middle line is empty and so has no delimiter
+fields='1 2 3 4
+
+a b c'
+
+# Without -s the delimiter-less line still produces an output line
+t --input="$fields" f2 '-f2' '2
+
+b
+'
+
+# With -s the delimiter-less line is suppressed
+t --input="$fields" s_f2 '-s -f2' '2
+b
+'